#############################################################
# Data prep
library("rio")
x <- import("https://docs.google.com/spreadsheets/d/1h7XhLd2Byp4OcXSdtxHly9iS7RA673RJ/export?format=csv&gid=1083477596")
x$commentCount <- as.integer(x$commentCount)
x$viewsCount <- as.numeric(x$viewsCount)
x$acousticness <- as.numeric(sub(",", ".", x$acousticness, fixed = TRUE))
x$danceability <- as.numeric(sub(",", ".", x$danceability, fixed = TRUE))
x$energy <- as.numeric(sub(",", ".", x$energy, fixed = TRUE))
x$instrumentalness <- as.numeric(sub(",", ".", x$instrumentalness, fixed = TRUE))
x$liveness <- as.numeric(sub(",", ".", x$liveness, fixed = TRUE))
x$loudness <- as.numeric(sub(",", ".", x$loudness, fixed = TRUE))
x$speechiness <- as.numeric(sub(",", ".", x$speechiness, fixed = TRUE))
x$tempo <- as.numeric(sub(",", ".", x$tempo, fixed = TRUE))
x$valence <- as.numeric(sub(",", ".", x$valence, fixed = TRUE))
x$key <- as.factor(x$key)
x$time_signature <- as.factor(x$time_signature)
min_max_normalize <- function(x)
{
return( (1000-10)*((x- min(x)) /(max(x)-min(x))) + 10)
}
x$acousticness <- min_max_normalize(x$acousticness)
x$danceability <- min_max_normalize(x$danceability)
x$energy <- min_max_normalize(x$energy)
x$instrumentalness <- min_max_normalize(x$instrumentalness)
x$liveness <- min_max_normalize(x$liveness)
x$loudness <- min_max_normalize(x$loudness)
x$speechiness <- min_max_normalize(x$speechiness)
x$tempo <- min_max_normalize(x$tempo)
x$valence <- min_max_normalize(x$valence)
x$Genre <- as.factor(x$Genre)
x$key <- as.factor(x$key)
x$mode <- as.factor(x$mode)
x$Charts <- as.factor(x$Charts)
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
x <- dplyr::select(x, -one_of('Streams'))
x <- x[complete.cases(x), ]
var_list <- c('Artist_Albums_Number', 'Artist_Albums_Tracks_Number', 'Artist_Appearances_Number', 'Artist_Appearances_Tracks_Number', 'Artist_Compilations_Number', 'Artist_Compilations_Tracks_Number', 'Artist_Follower', 'Artist_Popularity', 'Artist_Singles_Number', 'Artist_Singles_Tracks_Number', 'Track_Duration_ms', 'Track_Popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'commentCount', 'dislikeCount', 'likeCount', 'viewsCount')
par(mar = rep(2, 4))
par(mfrow=c(5,5))
for (i in 1:length(var_list)){
hist(x[, var_list[i]], probability = TRUE, col = "gray", main = var_list[i], xlab = "")
lines(density(x[, var_list[i]]), col = "red")
}
par(mar = rep(2, 4))
par(mfrow=c(5,5))
for(i in 1:length(var_list)){
df <- data.frame(x[,var_list[i]])
names(df)[1] <- var_list[i]
#ggplot(df, aes(x="",y=Artist_Albums_Number)) + stat_boxplot(geom ='errorbar') + geom_boxplot(outlier.colour="black", outlier.shape=16, #outlier.size=2, notch=FALSE)
boxplot(x = df[,1], main = var_list[i], notch = FALSE)
}
Normality testing
#strictly_positive_variables <- c('Artist_Follower', 'Artist_Popularity', 'Track_Duration_ms', 'Track_Popularity', 'viewsCount', #'acousticness', 'danceability', 'energy', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence')
for (i in 1:length(var_list)){
column_name <- var_list[i]
message(column_name)
sub_df <- as.numeric(x[,column_name])
# sub_df <- sub_df[complete.cases(sub_df), ]
#sub_df <- as.numeric(as.character(unlist(sub_df[[1]])))
test_statistic <- ks.test(sub_df, "pnorm", mean=mean(sub_df), sd=sd(sub_df))$statistic
critical_value <- 1.3581 / sqrt (length(sub_df))
if (test_statistic > critical_value) {
message(column_name)
} else {
message(paste(" ", column_name , " is approximately normally distributed!", test_statistic, critical_value))
}}
## Artist_Albums_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Albums_Number
## Artist_Albums_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Albums_Tracks_Number
## Artist_Appearances_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Appearances_Number
## Artist_Appearances_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Appearances_Tracks_Number
## Artist_Compilations_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Compilations_Number
## Artist_Compilations_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Compilations_Tracks_Number
## Artist_Follower
## Artist_Follower
## Artist_Popularity
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Popularity is approximately normally distributed! 0.0579112990955349 0.0970071428571429
## Artist_Singles_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Singles_Number
## Artist_Singles_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Singles_Tracks_Number
## Track_Duration_ms
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Track_Duration_ms
## Track_Popularity
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Track_Popularity
## acousticness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## acousticness
## danceability
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## danceability
## energy
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## energy
## instrumentalness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## instrumentalness
## liveness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## liveness
## loudness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## loudness
## speechiness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## speechiness
## tempo
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## tempo
## valence
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## valence
## commentCount
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## commentCount
## dislikeCount
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## dislikeCount
## likeCount
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## likeCount
## viewsCount
## viewsCount
x_scaled <- as.data.frame(scale(x[, var_list]))
par(mar = rep(2, 4))
par(mfrow=c(5,5))
for (i in 1:length(var_list)){
hist(x_scaled[, var_list[i]], probability = TRUE, col = "gray", main = var_list[i], xlab = "")
lines(density(x_scaled[, var_list[i]]), col = "red")
}
par(mar = rep(2, 4))
par(mfrow=c(5,5))
for(i in 1:length(var_list)){
df <- data.frame(x_scaled[,var_list[i]])
names(df)[1] <- var_list[i]
#ggplot(df, aes(x="",y=Artist_Albums_Number)) + stat_boxplot(geom ='errorbar') + geom_boxplot(outlier.colour="black", outlier.shape=16, #outlier.size=2, notch=FALSE)
boxplot(x = df[,1], main = var_list[i], notch = FALSE)
}
for (i in 1:length(var_list)){
column_name <- var_list[i]
message(column_name)
sub_df <- as.numeric(x_scaled[,column_name])
# sub_df <- sub_df[complete.cases(sub_df), ]
#sub_df <- as.numeric(as.character(unlist(sub_df[[1]])))
test_statistic <- ks.test(sub_df, "pnorm", mean=mean(sub_df), sd=sd(sub_df))$statistic
critical_value <- 1.3581 / sqrt (length(sub_df))
if (test_statistic > critical_value) {
message(column_name)
} else {
message(paste(" ", column_name , " is approximately normally distributed!", test_statistic, critical_value))
}}
## Artist_Albums_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Albums_Number
## Artist_Albums_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Albums_Tracks_Number
## Artist_Appearances_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Appearances_Number
## Artist_Appearances_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Appearances_Tracks_Number
## Artist_Compilations_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Compilations_Number
## Artist_Compilations_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Compilations_Tracks_Number
## Artist_Follower
## Artist_Follower
## Artist_Popularity
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Popularity is approximately normally distributed! 0.0579112990955349 0.0970071428571429
## Artist_Singles_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Singles_Number
## Artist_Singles_Tracks_Number
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Artist_Singles_Tracks_Number
## Track_Duration_ms
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Track_Duration_ms
## Track_Popularity
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## Track_Popularity
## acousticness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## acousticness
## danceability
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## danceability
## energy
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## energy
## instrumentalness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## instrumentalness
## liveness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## liveness
## loudness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## loudness
## speechiness
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## speechiness
## tempo
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## tempo
## valence
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## valence
## commentCount
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## commentCount
## dislikeCount
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## dislikeCount
## likeCount
## Warning in ks.test(sub_df, "pnorm", mean = mean(sub_df), sd = sd(sub_df)): ties
## should not be present for the Kolmogorov-Smirnov test
## likeCount
## viewsCount
## viewsCount
strictly_positive_variables <- c('Artist_Follower', 'Artist_Popularity', 'Artist_Singles_Number', 'Artist_Singles_Tracks_Number', 'Track_Duration_ms', 'Track_Popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'likeCount', 'viewsCount')
library("psych")
library("car")
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
ksD <- function (p, x) {
y <- bcPower(x, p)
ks.test(y, "pnorm", mean=mean(y), sd=sd(y))$statistic
}
oldw <- getOption("warn")
options(warn = -1)
min_values <- c()
for (column_index in 1:length(strictly_positive_variables)){
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
result <- optimize(ksD, c(-5,5), x=x_sub)
min_values[column_index] <- result$minimum
message(paste(column_index, ', minimum value is: ', result$minimum))
}
## 1 , minimum value is: -0.0049316395173003
## 2 , minimum value is: 1.40215894049501
## 3 , minimum value is: 0.240525077020065
## 4 , minimum value is: 0.262671909258923
## 5 , minimum value is: -0.699387491055179
## 6 , minimum value is: 0.949365177476973
## 7 , minimum value is: 0.130600084313768
## 8 , minimum value is: 3.51730755217668
## 9 , minimum value is: 3.07785781966221
## 10 , minimum value is: -0.795345146822112
## 11 , minimum value is: -0.303366878680729
## 12 , minimum value is: 4.64767393430379
## 13 , minimum value is: -0.403363137393642
## 14 , minimum value is: 1.16379372652619
## 15 , minimum value is: 0.453314295028104
## 16 , minimum value is: 0.05466614868127
## 17 , minimum value is: 0.0345455532264414
options(warn = oldw)
column_index <- 1
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
Artist_Follower_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 2
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
Artist_Popularity_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 3
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
Artist_Singles_Number_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 4
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
Artist_Singles_Tracks_Number_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 5
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
Track_Duration_ms_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 6
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
Track_Popularity_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 7
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
acousticness_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 8
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
danceability_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 9
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
energy_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 10
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
instrumentalness_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 11
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
liveness_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 12
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
loudness_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 13
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
speechiness_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 14
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
tempo_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 15
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
valence_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 16
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
likeCount_trans <- bcPower(x_sub, min_values[column_index])
column_index <- 17
column_name <- strictly_positive_variables[column_index]
x_sub <- x[[paste(column_name)]]
viewsCount_trans <- bcPower(x_sub, min_values[column_index])
hist_trans_list <- list(Artist_Follower_trans,
Artist_Popularity_trans,
Artist_Singles_Number_trans,
Artist_Singles_Tracks_Number_trans,
Track_Duration_ms_trans,
Track_Popularity_trans,
acousticness_trans,
danceability_trans,
energy_trans,
instrumentalness_trans,
liveness_trans,
loudness_trans,
speechiness_trans,
tempo_trans,
valence_trans,
likeCount_trans,
viewsCount_trans)
par(mar = rep(2, 4))
par(mfrow=c(4,5))
for (trans_index in 1:length(hist_trans_list)){
column_name <- strictly_positive_variables[trans_index]
selected_trans <- hist_trans_list[trans_index]
selected_trans <- as.numeric(as.character(unlist(selected_trans[[1]])))
hist(selected_trans, col = "gray", probability = TRUE, main = column_name, xlab = "")
points(seq(min(selected_trans), max(selected_trans), length.out = 500),
dnorm(seq(min(selected_trans), max(selected_trans), length.out = 500),
mean(selected_trans),sd(selected_trans)), type = "l", col = "red")
test_statistic <- ks.test(selected_trans, "pnorm", mean=mean(selected_trans), sd=sd(selected_trans))$statistic
critical_value <- 1.3581 / sqrt (length(selected_trans))
if (test_statistic > critical_value) {
message(paste("Transformed ", column_name , " is not approximately normally distributed.", test_statistic, critical_value))
} else {
message(paste("Transformed ", column_name , " is approximately normally distributed!", test_statistic, critical_value))
}}
## Transformed Artist_Follower is approximately normally distributed! 0.0485813205231013 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed Artist_Popularity is approximately normally distributed! 0.047962859540898 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed Artist_Singles_Number is approximately normally distributed! 0.0639302516161726 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed Artist_Singles_Tracks_Number is approximately normally distributed! 0.0334178939616578 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed Track_Duration_ms is not approximately normally distributed. 0.0998436682047523 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed Track_Popularity is not approximately normally distributed. 0.108864892250361 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed acousticness is not approximately normally distributed. 0.108064790247887 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed danceability is not approximately normally distributed. 0.113440523942993 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed energy is not approximately normally distributed. 0.114310681616891 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed instrumentalness is not approximately normally distributed. 0.285896259482658 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed liveness is not approximately normally distributed. 0.103366239896501 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed loudness is approximately normally distributed! 0.0958614307678946 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed speechiness is approximately normally distributed! 0.0660473903454177 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed tempo is approximately normally distributed! 0.0841680160106226 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed valence is approximately normally distributed! 0.0879386163770323 0.0970071428571429
## Warning in ks.test(selected_trans, "pnorm", mean = mean(selected_trans), : ties
## should not be present for the Kolmogorov-Smirnov test
## Transformed likeCount is approximately normally distributed! 0.0696106433502905 0.0970071428571429
## Transformed viewsCount is approximately normally distributed! 0.0770680692211038 0.0970071428571429
par(mar = rep(2, 4))
par(mfrow=c(4,5))
for(trans_index in 1:length(strictly_positive_variables)){
column_name <- strictly_positive_variables[trans_index]
selected_trans <- hist_trans_list[trans_index]
selected_trans <- as.numeric(as.character(unlist(selected_trans[[1]])))
df <- data.frame(selected_trans)
names(df)[1] <- strictly_positive_variables[trans_index]
boxplot(x = df[,1], main = strictly_positive_variables[trans_index], notch = FALSE)
}
Correlation
library(corrplot)
## corrplot 0.84 loaded
method <- "pearson"
clean_cor <- cor(x[,var_list], method = method)
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], ..., method = method)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
# matrix of the p-value of the correlation
p.mat <- cor.mtest(clean_cor)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
significance_level <- 0.05
corrplot(clean_cor, method="color", col=col(200),
type="upper", order="alphabet",
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=90, #Text label color and rotation
# Combine with significance
p.mat = p.mat, sig.level = significance_level, insig = "blank",
# hide correlation coefficient on the principal diagonal
diag=FALSE)
Steiger’s Z test
#
lm1 <- lm(x$Artist_Popularity ~ x$Artist_Follower)
summary(lm1)
##
## Call:
## lm(formula = x$Artist_Popularity ~ x$Artist_Follower)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.796 -9.761 2.344 10.590 27.413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.268e+01 1.029e+00 60.900 < 2e-16 ***
## x$Artist_Follower 1.216e-06 1.491e-07 8.154 4.3e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.72 on 194 degrees of freedom
## Multiple R-squared: 0.2552, Adjusted R-squared: 0.2514
## F-statistic: 66.49 on 1 and 194 DF, p-value: 4.298e-14
lm2 <- lm(Artist_Popularity_trans ~ Artist_Follower_trans)
summary(lm2)
##
## Call:
## lm(formula = Artist_Popularity_trans ~ Artist_Follower_trans)
##
## Residuals:
## Min 1Q Median 3Q Max
## -92.767 -23.106 -7.232 23.326 121.213
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -108.514 12.039 -9.014 <2e-16 ***
## Artist_Follower_trans 31.226 1.017 30.713 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.89 on 194 degrees of freedom
## Multiple R-squared: 0.8294, Adjusted R-squared: 0.8285
## F-statistic: 943.3 on 1 and 194 DF, p-value: < 2.2e-16
par(mfrow=c(1,2))
plot(x$Artist_Follower, x$Artist_Popularity, main = paste("Untransformed, R^2=", round(summary(lm1)$r.squared, digits =2)), xlab = "Artist_Follower", ylab = "Artist_Popularity")
plot(Artist_Follower_trans, Artist_Popularity_trans, main = paste("Box-Cox transformed, R^2=", round(summary(lm2)$r.squared, digits =2)), xlab = "Artist_Follower", ylab = "Artist_Popularity")
bivariate_df_base <- data.frame(x$Artist_Follower, x$Artist_Popularity)
bivariate_df_bc <- data.frame(Artist_Follower_trans, Artist_Popularity_trans)
library("MVN")
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## sROC 0.1-2 loaded
MVN::mvn(bivariate_df_base, mvnTest = "mardia")$multivariateNormality # Not jointly normal
MVN::mvn(bivariate_df_base, mvnTest = "hz")$multivariateNormality # Not jointly normal
MVN::mvn(bivariate_df_base, mvnTest = "royston")$multivariateNormality # Not jointly normal
MVN::mvn(bivariate_df_base, mvnTest = "energy")$multivariateNormality # Jointly normal
MVN::mvn(bivariate_df_bc, mvnTest = "mardia")$multivariateNormality # Not jointly normal
MVN::mvn(bivariate_df_bc, mvnTest = "hz")$multivariateNormality # Not jointly normal
MVN::mvn(bivariate_df_bc, mvnTest = "royston")$multivariateNormality # Not jointly normal
MVN::mvn(bivariate_df_bc, mvnTest = "energy")$multivariateNormality # Jointly normal
boxcox_df <- data.frame(matrix(vector(), nrow = 196, ncol = 17))
boxcox_df$Artist_Follower <- Artist_Follower_trans
boxcox_df$Artist_Popularity <- Artist_Popularity_trans
boxcox_df$Artist_Singles_Number <- Artist_Singles_Number_trans
boxcox_df$Artist_Singles_Tracks_Number <- Artist_Singles_Tracks_Number_trans
boxcox_df$Track_Duration_ms <- Track_Duration_ms_trans
boxcox_df$Track_Popularity <- Track_Popularity_trans
boxcox_df$acousticness <- acousticness_trans
boxcox_df$danceability <- danceability_trans
boxcox_df$energy <- energy_trans
boxcox_df$instrumentalness <- instrumentalness_trans
boxcox_df$liveness <- liveness_trans
boxcox_df$loudness <- loudness_trans
boxcox_df$speechiness <- speechiness_trans
boxcox_df$tempo <- tempo_trans
boxcox_df$valence <- valence_trans
boxcox_df$likeCount <- likeCount_trans
boxcox_df$viewsCount <- viewsCount_trans
drop.cols <- c('X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17')
boxcox_df <- dplyr::select(boxcox_df, -one_of(drop.cols))
par(mfrow=c(1,1))
palette <- c('red', 'blue', 'black', 'purple')
# palette <- rainbow(length(levels(as.factor(x$Genre))))
my_colors <- palette[as.numeric(x$Genre)]
plot(Artist_Follower_trans, Artist_Popularity_trans, main = paste("Box-Cox transformed, R^2=", round(summary(lm2)$r.squared, digits =2)), xlab = "Artist_Follower", ylab = "Artist_Popularity", col = my_colors, pch = 16)
legend("bottomright", legend = levels(as.factor(x$Genre)), col = palette, pch = 16)
library("lattice")
xyplot(Artist_Popularity_trans~Artist_Follower_trans|x$Genre, pch=19, xlab = 'viewsCount (transformed)', ylab = 'Artist_Popularity (transformed)', main = "Biplot")
# pairs(boxcox_df)
sunflower_Artist_pop <- 2*round(boxcox_df$Artist_Popularity/2)
sunflower_viewsCount <- 2*round(boxcox_df$viewsCount/2)
sunflowerplot(sunflower_Artist_pop~sunflower_viewsCount, xlab = 'viewsCount (transformed)', ylab = 'Artist_Popularity (transformed)', main = 'Sunflower plot')
require("MASS")
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
parcoord(boxcox_df, var.label = FALSE)
#library("lattice")
#parallelplot(boxcox_df, horizontal.axis=T)
#library("ggplot2")
#library("GGally")
#ggparcoord(boxcox_df) + geom_line()
library("andrews")
andrews(boxcox_df, ymax=7)
audio_features <- c('acousticness' , 'danceability' ,'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence')
selected_columns <- dplyr::select(boxcox_df, audio_features)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(audio_features)` instead of `audio_features` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
parcoord(selected_columns , col= my_colors, var.label = FALSE, ... = par(las=2, mar = c(8,1,3,1), xpd=TRUE))
## Warning in plot.window(...): "..." ist kein Grafikparameter
## Warning in plot.xy(xy, type, ...): "..." ist kein Grafikparameter
## Warning in title(...): "..." ist kein Grafikparameter
# title("Parallel coordinates", line = 0.6, adj = 0.1)
legend("topright", inset=c(0.035,-0.08),legend = c('Classic', 'Techno', 'Pop', 'Hip Hop'), col = unique(my_colors),
bty = 'o', horiz = TRUE, pch = 15)
ax <- selected_columns
ax$clr <- x$Genre
andrews(ax, ymax=4, clr=10)
legend("topright",legend=levels(ax$clr), fill=rainbow(length(levels(as.factor(ax$clr)))))
library("scagnostics")
## Loading required package: rJava
sub_df <- as.data.frame(x[, var_list])
s <- scagnostics(sub_df)
o <- scagnosticsOutliers(s)
g <- scagnosticsGrid(s)
go <- g[o,]
# Outliers: strongly different from the other plots
par(mfrow=c(1,1))
for (i in 1:nrow(go)){
plot(sub_df[[go[i,1]]], sub_df[[go[i,2]]], pch=19, xlab=names(sub_df)[go[i,1]], ylab=names(sub_df)[go[i,2]])
print(paste("Outlying pair: ", "y=", names(sub_df)[go[i,2]], "x=", names(sub_df)[go[i,1]]))}
## [1] "Outlying pair: y= Artist_Compilations_Number x= Artist_Albums_Number"
## [1] "Outlying pair: y= Artist_Compilations_Number x= Artist_Albums_Tracks_Number"
## [1] "Outlying pair: y= Artist_Compilations_Tracks_Number x= Artist_Albums_Number"
## [1] "Outlying pair: y= Artist_Compilations_Tracks_Number x= Artist_Compilations_Number"
## [1] "Outlying pair: y= Artist_Popularity x= Artist_Follower"
## [1] "Outlying pair: y= Artist_Singles_Number x= Artist_Albums_Number"
## [1] "Outlying pair: y= Artist_Singles_Number x= Artist_Compilations_Tracks_Number"
## [1] "Outlying pair: y= Artist_Singles_Tracks_Number x= Artist_Compilations_Tracks_Number"
## [1] "Outlying pair: y= Artist_Singles_Tracks_Number x= Artist_Singles_Number"
## [1] "Outlying pair: y= instrumentalness x= Track_Duration_ms"
## [1] "Outlying pair: y= loudness x= Artist_Albums_Tracks_Number"
## [1] "Outlying pair: y= valence x= instrumentalness"
## [1] "Outlying pair: y= dislikeCount x= Artist_Follower"
## [1] "Outlying pair: y= likeCount x= Artist_Follower"
## [1] "Outlying pair: y= likeCount x= commentCount"
# Highlights:
par(mfrow=c(1,2))
plot(x$Track_Duration_ms, x$instrumentalness, col = my_colors, pch = 16, xlab = 'Track_Duration_ms', ylab = 'instrumentalness')
plot(x$instrumentalness, x$valence, col = my_colors, pch = 16, xlab = 'instrumentalness', ylab = 'valence')
legend(x=-1300, y=1250, legend = levels(as.factor(x$Genre)), col = palette, pch = 16, horiz = TRUE, xpd = NA)
# Exemplars: group of similar plots
e <- scagnosticsExemplars(s)
ge <- g[e,]
par(mfrow = c(2,2))
for (i in 1:dim(ge)[1]){
plot(sub_df[[ge$x[i]]], sub_df[[ge$y[i]]], pch=19,
xlab=names(sub_df)[ge$x[i]], ylab=names(sub_df)[ge$y[i]])
print(paste("Exemplary pair: ", "y=", names(sub_df)[ge$y[i]], "x=", names(sub_df)[ge$x[i]]))
}
## [1] "Exemplary pair: y= Track_Popularity x= Artist_Appearances_Tracks_Number"
## [1] "Exemplary pair: y= energy x= Artist_Albums_Tracks_Number"
## [1] "Exemplary pair: y= likeCount x= Artist_Appearances_Number"
## [1] "Exemplary pair: y= viewsCount x= tempo"
table(x$Genre)
##
## Classic Hip Hop Pop Techno
## 49 50 50 47
x$Artist_Popularity_quantile <- 0
Artist_Popularity_quantiles <- quantile(x$Artist_Popularity, probs = c(0.25, 0.5, 0.75))
Artist_Pop_q_25 <- Artist_Popularity_quantiles[1]
Artist_Pop_median <- Artist_Popularity_quantiles[2]
Artist_Pop_q_75 <- Artist_Popularity_quantiles[3]
x$Artist_Popularity_quantile <- ifelse(x$Artist_Popularity < Artist_Pop_q_25, 1, x$Artist_Popularity_quantile + 0)
x$Artist_Popularity_quantile <- ifelse(((x$Artist_Popularity >= Artist_Pop_q_25) & (x$Artist_Popularity < Artist_Pop_median)), 2, x$Artist_Popularity_quantile + 0)
x$Artist_Popularity_quantile <- ifelse(((x$Artist_Popularity >= Artist_Pop_median) & (x$Artist_Popularity < Artist_Pop_q_75)), 3, x$Artist_Popularity_quantile + 0)
x$Artist_Popularity_quantile <- ifelse((x$Artist_Popularity >= Artist_Pop_q_75), 4, x$Artist_Popularity_quantile + 0)
x$Artist_Popularity_quantile <- as.factor(x$Artist_Popularity_quantile)
tab<-table(x$Genre, x$Artist_Popularity_quantile)
tab
##
## 1 2 3 4
## Classic 9 21 13 6
## Hip Hop 0 8 25 17
## Pop 1 8 10 31
## Techno 37 10 0 0
# r = 4, c = 4
#df = (r-1)*(c-1) = 3+3 = 9
# critical value: 16.919
chisq.test(tab)
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 156.23, df = 9, p-value < 2.2e-16
chisq.test(tab, simulate.p.value = TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: tab
## X-squared = 156.23, df = NA, p-value = 0.0004998
library("vcd")
## Loading required package: grid
assocstats(tab)
## X^2 df P(> X^2)
## Likelihood Ratio 168.67 9 0
## Pearson 156.23 9 0
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.666
## Cramer's V : 0.515
#library("openintro")
#contTable(tab)
x$viewsCount_quantile <- 0
viewsCount_quantiles <- quantile(x$viewsCount, probs = c(0.25, 0.5, 0.75))
viewsCount_q_25 <- viewsCount_quantiles[1]
viewsCount_median <- viewsCount_quantiles[2]
viewsCount_q_75 <- viewsCount_quantiles[3]
x$viewsCount_quantile <- ifelse(x$viewsCount < viewsCount_q_25, 1, x$viewsCount_quantile + 0)
x$viewsCount_quantile <- ifelse(((x$viewsCount >= viewsCount_q_25) & (x$viewsCount < viewsCount_median)), 2, x$viewsCount_quantile + 0)
x$viewsCount_quantile <- ifelse(((x$viewsCount >= viewsCount_median) & (x$viewsCount < viewsCount_q_75)), 3, x$viewsCount_quantile + 0)
x$viewsCount_quantile <- ifelse((x$viewsCount >= viewsCount_q_75), 4, x$viewsCount_quantile + 0)
x$viewsCount_quantile <- as.factor(x$viewsCount_quantile)
tab<-table(x$viewsCount_quantile, x$Artist_Popularity_quantile)
tab
##
## 1 2 3 4
## 1 36 6 4 3
## 2 10 19 12 8
## 3 1 18 19 11
## 4 0 4 13 32
chisq.test(tab)
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 133.34, df = 9, p-value < 2.2e-16
chisq.test(tab, simulate.p.value = TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: tab
## X-squared = 133.34, df = NA, p-value = 0.0004998
library("vcd")
assocstats(tab)
## X^2 df P(> X^2)
## Likelihood Ratio 133.48 9 0
## Pearson 133.34 9 0
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.636
## Cramer's V : 0.476
#contTable(tab)
ab2 <- na.omit(cbind(x$Artist_Popularity_quantile, x$viewsCount_quantile))
nrow(ab2)*(nrow(ab2)-1)/2
## [1] 19110
#
ind <- order(ab2[,1], ab2[,2])
ab2 <- ab2[ind,]
#b
C <- D <- Tx <- Ty <- Txy <- 0
for (i in 1:(nrow(ab2)-1)) {
if (i%%100==0) cat(i, "\n")
for(j in (i+1):nrow(ab2)) {
if (ab2[i,1]==ab2[j,1]) {
if (ab2[i,2]==ab2[j,2]) {
Txy <- Txy+1
} else {
Tx <- Tx+(ab2[i,2]<ab2[j,2])
}
} else {
if (ab2[i,2]==ab2[j,2]) Ty <- Ty+1
if (ab2[i,2]<ab2[j,2]) C <- C+1
if (ab2[i,2]>ab2[j,2]) D <- D+1
}
}
}
## 100
c(C=C, D=D, Tx=Tx, Ty=Ty, Txy=Txy)
## C D Tx Ty Txy
## 10048 1560 2798 2781 1923
k_t <- (C - D)/(nrow(ab2)*(nrow(ab2)-1)/2)
k_t # (without ties)
## [1] 0.4441654
library("ryouready")
ord.tau(table(ab2[,1], ab2[,2]))
## Kendall's (and Stuart's) Tau statistics
## Tau-b: 0.590
## Tau-c: 0.589
cor(as.numeric(x$Artist_Popularity_quantile), as.numeric(x$viewsCount_quantile), method = "kendall") # identical result
## [1] 0.5895469
cor.test(as.numeric(x$Artist_Popularity_quantile), as.numeric(x$viewsCount_quantile), method="kendall")
##
## Kendall's rank correlation tau
##
## data: as.numeric(x$Artist_Popularity_quantile) and as.numeric(x$viewsCount_quantile)
## z = 9.8748, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.5895469
modetab <- function(x, margin=0) {
if (margin>0) apply(x, margin, max) else max(x)
}
tab2 <- table(x$Artist_Popularity_quantile,
x$Genre, dnn = c("Artist Popularity quantile", "Genre"))
tab2
## Genre
## Artist Popularity quantile Classic Hip Hop Pop Techno
## 1 9 0 1 37
## 2 21 8 8 10
## 3 13 25 10 0
## 4 6 17 31 0
# contTable(tab2)
tab <- rowSums(tab2)
message(paste("The mode across all four quantiles of stream distribution is", modetab(tab), " and is in the 4th quantile. So the best prediction for a new artist without further knowledge is that she will fall in the top quantile of the popularity distribution."))
## The mode across all four quantiles of stream distribution is 54 and is in the 4th quantile. So the best prediction for a new artist without further knowledge is that she will fall in the top quantile of the popularity distribution.
e1_streams <- sum(tab) - modetab(tab)
message(paste("Without any knowledge about an additional feature other than artist popularity quantile itself and using the mode across classes as best predictor for class assignment for a new observation, the number of falsely predicted cases would be: ", e1_streams, " or ", e1_streams/sum(tab)*100, "%."))
## Without any knowledge about an additional feature other than artist popularity quantile itself and using the mode across classes as best predictor for class assignment for a new observation, the number of falsely predicted cases would be: 142 or 72.4489795918367 %.
e2_streams <- sum(tab2) - sum(modetab(tab2, 2))
message(paste("Now having knowledge about an association between artist popularity and its genre and using class-internal modes for the additional feature, the number of falsely predicted cases would be: ", e2_streams, " or ", e2_streams/sum(tab)*100, "%, which is already much lower compared to the error rate from predicting without any knowledge about an association. On the other hand, ", sum(tab2)-e2_streams, " or ", (sum(tab2)-e2_streams)/sum(tab2)*100, "% would have been predicted correctly (compared to ", (1-e1_streams/sum(tab))*100, "% if there was no knowledge about genre.)"))
## Now having knowledge about an association between artist popularity and its genre and using class-internal modes for the additional feature, the number of falsely predicted cases would be: 82 or 41.8367346938776 %, which is already much lower compared to the error rate from predicting without any knowledge about an association. On the other hand, 114 or 58.1632653061224 % would have been predicted correctly (compared to 27.5510204081633 % if there was no knowledge about genre.)
lambda <- (e1_streams - e2_streams)/e1_streams
message(paste("Therefore, Goodmann and Kruskals Lambda is: ", lambda))
## Therefore, Goodmann and Kruskals Lambda is: 0.422535211267606
library("ryouready")
nom.lambda(tab2)
## Lambda:
## Columns dependent: 0.438
## Rows dependent: 0.423
## Symmetric: 0.431
modetab <- function(x, margin=0) {
if (margin>0) apply(x, margin, max) else max(x)
}
tab2 <- table(x$Artist_Popularity_quantile,
x$viewsCount_quantile, dnn = c("Artist Popularity quantile", "viewsCount quantile"))
tab2
## viewsCount quantile
## Artist Popularity quantile 1 2 3 4
## 1 36 10 1 0
## 2 6 19 18 4
## 3 4 12 19 13
## 4 3 8 11 32
tab <- rowSums(tab2)
message(paste("The mode across all four quantiles of stream distribution is", modetab(tab), " and is in the 4th quantile. So the best prediction for a new artist without further knowledge is that she will fall in the top quantile of the popularity distribution."))
## The mode across all four quantiles of stream distribution is 54 and is in the 4th quantile. So the best prediction for a new artist without further knowledge is that she will fall in the top quantile of the popularity distribution.
e1_streams <- sum(tab) - modetab(tab)
message(paste("Without any knowledge about an additional feature other than artist popularity quantile itself and using the mode across classes as best predictor for class assignment for a new observation, the number of falsely predicted cases would be: ", e1_streams, " or ", e1_streams/sum(tab)*100, "%."))
## Without any knowledge about an additional feature other than artist popularity quantile itself and using the mode across classes as best predictor for class assignment for a new observation, the number of falsely predicted cases would be: 142 or 72.4489795918367 %.
e2_streams <- sum(tab2) - sum(modetab(tab2, 2))
message(paste("Now having knowledge about an association between artist popularity and its viewsCount quantile and using class-internal modes for the additional feature, the number of falsely predicted cases would be: ", e2_streams, " or ", e2_streams/sum(tab)*100, "%, which is already much lower compared to the error rate from predicting without any knowledge about an association. On the other hand, ", sum(tab2)-e2_streams, " or ", (sum(tab2)-e2_streams)/sum(tab2)*100, "% would have been predicted correctly (compared to ", (1-e1_streams/sum(tab))*100, "% if there was no knowledge about viewsCount quantile.)"))
## Now having knowledge about an association between artist popularity and its viewsCount quantile and using class-internal modes for the additional feature, the number of falsely predicted cases would be: 90 or 45.9183673469388 %, which is already much lower compared to the error rate from predicting without any knowledge about an association. On the other hand, 106 or 54.0816326530612 % would have been predicted correctly (compared to 27.5510204081633 % if there was no knowledge about viewsCount quantile.)
lambda <- (e1_streams - e2_streams)/e1_streams
message(paste("Therefore, Goodmann and Kruskals Lambda is: ", lambda))
## Therefore, Goodmann and Kruskals Lambda is: 0.366197183098592
library("ryouready")
nom.lambda(tab2)
## Lambda:
## Columns dependent: 0.388
## Rows dependent: 0.366
## Symmetric: 0.377
ANOVA
fit <- aov(Artist_Popularity~Genre , data =x)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## Genre 3 31032 10344 110.3 <2e-16 ***
## Residuals 192 18006 94
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ks.test(x$Artist_Popularity[x$Genre == 'Classic'], "pnorm", mean = mean (x$Artist_Popularity[x$Genre == 'Classic']) , sd=sd(x$Artist_Popularity[x$Genre == 'Classic']))$statistic
## Warning in ks.test(x$Artist_Popularity[x$Genre == "Classic"], "pnorm", mean
## = mean(x$Artist_Popularity[x$Genre == : ties should not be present for the
## Kolmogorov-Smirnov test
## D
## 0.06209392
1.3581 / sqrt (length(x$Artist_Popularity[x$Genre == 'Classic']))
## [1] 0.1940143
ks.test(x$Artist_Popularity[x$Genre == 'Techno'], "pnorm", mean = mean (x$Artist_Popularity[x$Genre == 'Techno']) , sd=sd(x$Artist_Popularity[x$Genre == 'Techno']))$statistic
## Warning in ks.test(x$Artist_Popularity[x$Genre == "Techno"], "pnorm", mean
## = mean(x$Artist_Popularity[x$Genre == : ties should not be present for the
## Kolmogorov-Smirnov test
## D
## 0.1108916
1.3581 / sqrt (length(x$Artist_Popularity[x$Genre == 'Techno']))
## [1] 0.1980992
ks.test(x$Artist_Popularity[x$Genre == 'Hip Hop'], "pnorm", mean = mean (x$Artist_Popularity[x$Genre == 'Hip Hop']) , sd=sd(x$Artist_Popularity[x$Genre == 'Hip Hop']))$statistic
## Warning in ks.test(x$Artist_Popularity[x$Genre == "Hip Hop"], "pnorm", mean
## = mean(x$Artist_Popularity[x$Genre == : ties should not be present for the
## Kolmogorov-Smirnov test
## D
## 0.09990045
1.3581 / sqrt (length(x$Artist_Popularity[x$Genre == 'Hip Hop']))
## [1] 0.1920643
ks.test(x$Artist_Popularity[x$Genre == 'Pop'], "pnorm", mean = mean (x$Artist_Popularity[x$Genre == 'Pop']) , sd=sd(x$Artist_Popularity[x$Genre == 'Pop']))$statistic
## Warning in ks.test(x$Artist_Popularity[x$Genre == "Pop"], "pnorm", mean =
## mean(x$Artist_Popularity[x$Genre == : ties should not be present for the
## Kolmogorov-Smirnov test
## D
## 0.1191381
1.3581 / sqrt (length(x$Artist_Popularity[x$Genre == 'Pop']))
## [1] 0.1920643
leveneTest (Artist_Popularity~Genre, data =x)
varq <- tapply(x$Artist_Popularity, x$Genre, var, na.rm= TRUE)
varq/min(varq)
## Classic Hip Hop Pop Techno
## 1.522737 1.000000 2.293584 1.117978
PCA
audio_features <- c('acousticness' , 'danceability' ,'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence')
x_features <- dplyr::select(x, audio_features)
z <- scale(x_features)
eigen_cor <- eigen(cov(z))
E <- eigen_cor$vectors
pc_cor <- prcomp(z, center = F, scale = F)
plot(pc_cor, main="Scree plot as bar chart (cor)")
plot(pc_cor$sdev^2, type="b", main="Scree plot (cor)")
pc_index <- c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9")
eigen_values <- round(eigen_cor$values, digits = 2)
variance_explained <- round(eigen_cor$values/length(audio_features), digits = 2)
cum_variance <- round(cumsum(eigen_cor$values)/length(audio_features), digits = 2)
eigen_df <- data.frame(pc_index, eigen_values, variance_explained, cum_variance)
colnames(eigen_df) <- c("PC", "Eigen value", "Variance explained", "Cumulative variance explained")
as.data.frame(E)
E[,1] <- E[,1]*-1
E[,3] <- E[,3]*-1
E[,4] <- E[,4]*-1
E[,5] <- E[,5]*-1
E[,6] <- E[,6]*-1
as.data.frame(E)
scores <- z %*% E
library(corrplot)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
#par(mfrow=c(1,1))
#plot((pc_cor$sdev^2)/round(sum(pc_cor$sdev^2))*100, type="b", xlab = "Number of components", ylab = "Percentage of variance explained")
corrplot(pc_cor$rotation, method="color", col=col(200),
addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=90, #Text label color and rotation
diag=TRUE, mar = c(0,0,5,5), title = 'PCA - loadings matrix')
plot(scores[,1], scores[,2])
plot(pc_cor$x[,1], pc_cor$x[,2]) # both plots are identical
library("psych")
psych_pca <- psych::principal(z, nfactors = 9, rotate = "none")
scree(z)
# psych_pca$scores[,1]*sqrt(psych_pca$values[1]) # == scores[,1]
# transform loadings of psych::principal to unit-length eigenvectors according to 1/sqrt(a^2 + b^2) * (a,b)
# 1/sqrt(sum(psych_pca$loadings[,1]^2))*psych_pca$loadings[,1] # is identical to pc_cor$rotation[,1] and E[,1]
# transforming unit-length eigenvectors to loadings like in psych::principal:
manual_loadings <- matrix(nrow = dim(z)[2], ncol = dim(z)[2])
for (i in 1:dim(z)[2]){
manual_loadings[,i] <- E[,i] * sqrt(eigen_cor$values[i])
}
# Still signs are interchanged, affected components are 3 and 9, change signs manually
manual_loadings[,3] <- manual_loadings[,3]*-1
manual_loadings[,9] <- manual_loadings[,9]*-1
# Scores in psych are computed according to
(z %*% E[,1]) / sqrt(eigen_cor$values[1]) # == psych_pca$scores[,1]
## [,1]
## 1 -1.59390372
## 2 -1.89244014
## 3 -1.56727583
## 4 -2.02403816
## 5 -1.34328173
## 6 -1.98652322
## 7 -1.65063718
## 8 0.23454238
## 9 0.33189660
## 10 -0.50527012
## 11 0.48474383
## 12 0.35548126
## 13 0.40438968
## 14 0.41389912
## 15 0.88932839
## 16 0.29545887
## 17 0.74966760
## 18 -0.33006285
## 19 0.42233800
## 20 0.60508283
## 21 0.35531678
## 22 1.08286743
## 23 0.33772362
## 24 0.39438136
## 25 0.38074682
## 26 0.53275627
## 27 0.35164160
## 28 0.89806447
## 29 0.34945558
## 30 0.79905861
## 31 0.64973543
## 32 0.79940254
## 33 -1.28729950
## 34 0.32285315
## 35 0.28938500
## 36 0.59057212
## 37 -0.91856390
## 38 0.54122847
## 39 0.77222028
## 40 -0.08719343
## 41 0.53776316
## 42 0.48802436
## 43 0.92409950
## 44 0.32211784
## 45 0.35080523
## 46 0.50370165
## 47 0.43417619
## 48 0.65562461
## 49 0.82640247
## 50 0.22221084
## 51 1.23872589
## 52 -0.27480596
## 53 -0.60837106
## 54 1.09983941
## 55 1.07217376
## 56 1.01232351
## 57 0.62184771
## 58 0.84979050
## 59 0.80091618
## 60 0.85471565
## 61 0.88031931
## 62 1.25090015
## 63 0.89078171
## 64 0.76487318
## 65 0.76159508
## 66 0.30109461
## 67 0.68030322
## 68 0.80407226
## 69 0.72002543
## 70 1.15107907
## 71 0.40045381
## 72 1.02252137
## 73 0.48945386
## 74 0.89961735
## 75 0.58542734
## 76 0.99733632
## 77 0.79627998
## 78 0.98885771
## 79 1.28630203
## 80 0.46651096
## 81 1.33907715
## 82 0.64998223
## 83 0.55201140
## 84 0.75942752
## 85 0.53261883
## 86 0.74383810
## 87 0.74165546
## 88 0.74866137
## 89 0.96277142
## 90 0.62715735
## 91 0.64793200
## 92 0.95334623
## 93 0.44942747
## 94 0.69025775
## 95 0.97104873
## 96 1.18362619
## 97 0.59019519
## 98 -0.39117191
## 99 0.40327243
## 100 0.99054825
## 101 0.86270002
## 102 0.67556803
## 103 1.01390165
## 104 1.36233335
## 105 0.81443559
## 106 1.12486562
## 107 0.70258335
## 108 0.10632817
## 109 0.99193014
## 110 0.75504621
## 111 0.65676336
## 112 0.33047902
## 113 0.95870755
## 114 0.75603911
## 115 -2.11696526
## 116 -1.74364531
## 117 -1.53213316
## 118 -1.94291687
## 119 -1.72239513
## 120 -1.36619922
## 121 -1.66305915
## 122 -2.06411957
## 123 -1.97969420
## 124 -1.73865879
## 125 -1.59092573
## 126 -1.75704042
## 127 -1.65120933
## 128 -0.90738230
## 129 -1.57915022
## 130 -1.79710843
## 131 -0.72000536
## 132 -1.11982385
## 133 -1.16126131
## 134 -1.26343254
## 135 -1.59056965
## 136 -1.19417247
## 137 -1.92762287
## 138 -1.75542132
## 139 -2.11166874
## 140 -1.75851617
## 141 -1.30171171
## 142 -1.64499033
## 143 -0.70384029
## 145 -2.22479894
## 146 -0.43936157
## 147 -1.98892924
## 148 -2.10319657
## 149 0.35090994
## 150 0.35960014
## 152 0.27783403
## 153 0.23520099
## 154 0.42816581
## 155 0.29415102
## 156 0.28883891
## 157 0.28901016
## 158 0.32561591
## 159 0.33618271
## 160 0.53796886
## 161 0.15721183
## 162 0.48276945
## 163 0.18837789
## 165 0.38861944
## 166 0.15693865
## 167 0.34641160
## 168 0.46655158
## 169 0.33666927
## 170 0.23963328
## 171 0.37607131
## 172 0.51586048
## 173 -0.01126555
## 174 0.40426506
## 175 0.23144730
## 176 0.40281787
## 177 0.20758467
## 178 0.24328905
## 179 0.06504826
## 180 0.25132125
## 181 0.18718854
## 183 0.39566848
## 184 0.49874147
## 185 0.39761257
## 186 0.59112779
## 187 0.52223745
## 188 0.34826820
## 189 0.17079406
## 190 0.22460558
## 191 0.18252664
## 192 -1.69078006
## 193 -1.32535523
## 194 -1.35844177
## 195 -1.97935120
## 196 -1.43855432
## 197 -1.63893155
## 198 -0.83876042
## 199 -1.32071392
## 200 -1.71405630
# i.e. using the unit-length eigenvectors and dividing them by the square root of the corresponding eigenvalue
par(mfrow=c(1,2))
biplot(pc_cor) # loadings plot with loadings scaled to fit in same graph with scores
plot(pc_cor$rotation[, 1:2]) # loadings in original measures
text(pc_cor$rotation[, 1:2], as.character(colnames(z)), pos = 1, cex = 0.7, offset = 0.1)
#palette <- rainbow(length(levels(as.factor(x$Genre))))
#my_colors <- palette[as.numeric(x$Genre)]
palette <- c('red', 'blue', 'black', 'purple')
# palette <- rainbow(length(levels(as.factor(x$Genre))))
my_colors <- palette[as.numeric(x$Genre)]
par(mfrow=c(1,1))
plot(scores[, 1:2], col = my_colors, pch = 19, xlab = "Scores PC1 (47.84% expl. variance)", ylab = "Scores PC2 (16.25% expl. variance)")
legend("topleft", legend = levels(as.factor(x$Genre)), col = palette, pch = 16, horiz = T)
#plot(pc_cor$rotation[, 1:2])
par(mfrow=c(1,1))
plot(E[, 1:2])
text(E[, 1:2], as.character(colnames(z)), pos = 1, cex = 0.7, offset = 0.1)
cumsum(eigen_cor$values)/length(audio_features)
## [1] 0.4783993 0.6408610 0.7496500 0.8399854 0.9171536 0.9507038 0.9754113
## [8] 0.9910233 1.0000000
# According to the 90% criterion there would be 5 components needed to appropriately retain the variance in the data by reducing the feature space from nine to five.
RMSE = function(m, o){
sqrt(mean((m - o)^2))
}
# All expressions for the scores are identical
scores <- z %*% E
scores_from_psych <- (z %*% (1/sqrt(sum(psych_pca$loadings[,1]^2))*psych_pca$loadings[,1])) / sqrt(psych_pca$values[1]) # == psych_pca$scores[,1]
scores_from_princomp <- (z %*% pc_cor$rotation[,1]) / sqrt(eigen_cor$values[1])
fits <- matrix(nrow = dim(z)[2], ncol = 3)
for (i in 1:dim(z)[2]){
num_comp <- i
fit <- scores[, 1:num_comp] %*% t(E[, 1:num_comp])
residuals <- z - fit
rmse <- RMSE(z, fit)
r2 <- 1-(sum(diag(var(residuals)))/ sum(diag(var(z))))
fits[i,1] <- num_comp
fits[i,2] <- rmse
fits[i,3] <- r2
}
par(mfrow=c(1,1))
plot(fits[,1], fits[,2], type = "l", ylab = 'RMSE', xlab = 'Number of components')
par(new = TRUE)
plot(fits[,1], fits[,3], type = "l", axes = FALSE, bty = "n", xlab = "", ylab = "")
axis(side=4, at = pretty(range(fits[,3])))
mtext("R^2", side = 4, line = -1)
abline(v = 2, lty = 2, col = 'green')
mtext("Kaiser criterion", side = 1, line = 2, adj = 0.1)
abline(v=min(which(fits[,3]>=0.9)), lty = 2, col = 'red')
mtext("90% rule", side = 1, line = 1, adj = 0.5)
# Squared prediction errors / Q-residuals
par(mfrow=c(1,1))
a <- 1
Xhat <- pc_cor$x[,seq(1,a)] %*% t(pc_cor$rotation[,seq(1,a)])
RMSE(z, Xhat)
## [1] 0.7203745
res <- z - Xhat
E2 <- res * res
SPE_1 <- apply(E2, 1, sum) # not using sqrt(apply(E2, 1, sum))!
# Box (1954) method: weighted Chi-squared
v <- var(SPE_1)
m <- mean(SPE_1)
h <- (2*m^2)/v
g <- v / (2*m)
lim_k1_95 <- g*qchisq(.95, df=h)
plot(SPE_1, ylab = paste('SPE after using', a, 'component'), xlab = 'index')
abline(h = lim_k1_95, lty = 2, col = 'darkgreen')
mtext(paste('RMSE =', round(sqrt(mean(SPE_1)), digits = 4)), side=1, line=3.5, at=9)
sqrt(mean(SPE_1))
## [1] 2.161124
a <- 2
Xhat <- pc_cor$x[,seq(1,a)] %*% t(pc_cor$rotation[,seq(1,a)])
RMSE(z, Xhat)
## [1] 0.5977514
res <- z - Xhat
E2 <- res * res
#SPE_2 <- sqrt(apply(E2, 1, sum))
SPE_2 <- apply(E2, 1, sum)
# pca(z, ncomp = a, center = FALSE, scale = FALSE, alpha = 0.05)$Qlim[1]
v <- var(SPE_2)
m <- mean(SPE_2)
h <- (2*m^2)/v
g <- v / (2*m)
lim_k2_95 <- g*qchisq(.95, df=h)
plot(SPE_2, ylab = paste('SPE after using', a, 'components'), xlab = 'index')
abline(h = lim_k2_95, lty = 2, col = 'darkgreen')
mtext(paste('RMSE =', round(sqrt(mean(SPE_2)), digits = 4)), side=1, line=3.5, at=9)
sqrt(mean(SPE_2))
## [1] 1.793254
a <- 9
Xhat <- pc_cor$x[,seq(1,a)] %*% t(pc_cor$rotation[,seq(1,a)])
RMSE(z, Xhat)
## [1] 9.521473e-16
res <- z - Xhat
E2 <- res * res
# SPE_9 <- sqrt(apply(E2, 1, sum))
SPE_9 <- apply(E2, 1, sum)
# pca(z, ncomp = a, center = FALSE, scale = FALSE, alpha = 0.05)$Qlim[1]
v <- var(SPE_9)
m <- mean(SPE_9)
h <- (2*m^2)/v
g <- v / (2*m)
lim_k9_95 <- g*qchisq(.95, df=h)
plot(SPE_9, ylab = paste('SPE after using', a, 'components'), xlab = 'index')
abline(h = lim_k9_95, lty = 2, col = 'darkgreen')
mtext(paste('RMSE =', round(sqrt(mean(SPE_9)), digits = 4)), side=1, line=3.5, at=9)
sqrt(mean(SPE_9))
## [1] 2.856442e-15
par(mfrow=c(3,1))
plot(SPE_1, type = 'l', col = 'darkgreen', ylab = "SPE for PC1")
abline(h = lim_k1_95, lty = 2, col = 'darkgreen')
plot(SPE_2, type = 'l', col = 'red', ylab = "SPE for PC1 + PC2")
abline(h = lim_k2_95, lty = 2, col = 'darkgreen')
plot(SPE_9, type = 'l', col = 'blue', ylab = "SPE for all PCs")
abline(h = lim_k9_95, lty = 2, col = 'darkgreen')
sum(SPE_1 > lim_k1_95)
## [1] 6
sum(SPE_1 > lim_k1_95)/length(SPE_1)
## [1] 0.03061224
sum(SPE_2 > lim_k2_95)
## [1] 7
sum(SPE_2 > lim_k2_95)/length(SPE_2)
## [1] 0.03571429
sum(SPE_9 > lim_k9_95)
## [1] 10
sum(SPE_9 > lim_k9_95)/length(SPE_9)
## [1] 0.05102041
Hotelling’s T^2
num_comp <- 2
inverse_cov <- diag(eigen_cor$values[1:num_comp], nrow = num_comp, ncol = num_comp)^-1
inverse_cov[is.infinite(inverse_cov)] <- 0
tsquared <- diag(z %*% E[,1:num_comp] %*% inverse_cov %*% t(z %*% E[,1:num_comp]))
# which is equivalent to the Mahalanobis distance: (for k > 1)
tsquared_mahal <- mahalanobis(scores[,1:num_comp], center = FALSE, cov(cbind(scores[,1], scores[,num_comp])), inverted = FALSE)
tsquared_lim95 <- (((dim(z)[1] - 1) * (dim(z)[1] + 1) * num_comp) / (dim(z)[1] * (dim(z)[1] - num_comp))) * qf(p =.95, df1 = num_comp, df2 = dim(z)[1] - num_comp)
tsquared_lim99 <- (((dim(z)[1] - 1) * (dim(z)[1] + 1) * num_comp) / (dim(z)[1] * (dim(z)[1] - num_comp))) * qf(p =.99, df1 = num_comp, df2 = dim(z)[1] - num_comp)
par(mfrow=c(1,1))
plot(tsquared, type = 'l', ylim = c(0,tsquared_lim99*1.1), ylab = "Hotelling's T2")
abline(h=tsquared_lim99, col = 'red', lty = 2)
text(190,10, "99% limit", col = "red")
abline(h=tsquared_lim95, col = 'darkgreen', lty = 2)
text(190,6.6, "95% limit", col = "darkgreen")
# source: https://github.com/hredestig/pcaMethods/blob/master/R/pca.R
simpleEllipse <- function(x, y, alfa=0.95, len=200) {
N <- length(x)
A <- 2
mypi <- seq(0, 2 * pi, length=len)
r1 <- sqrt(var(x) * qf(alfa, 2, N - 2) * (2*(N^2 - 1)/(N * (N - 2))))
r2 <- sqrt(var(y) * qf(alfa, 2, N - 2) * (2*(N^2 - 1)/(N * (N - 2))))
cbind(r1 * cos(mypi) + mean(x), r2 * sin(mypi) + mean(y))
}
confidence_ellipse95 <- simpleEllipse(scores[,1], scores[,2], alfa = 0.95, len=500)
confidence_ellipse99 <- simpleEllipse(scores[,1], scores[,2], alfa = 0.99, len=500)
plot(scores[,1], scores[,2], xlim = c(min(confidence_ellipse99[,1]), max(confidence_ellipse99[,1])),
ylim = c(min(confidence_ellipse99[,2]), max(confidence_ellipse99[,2])), ylab = 'Scores PC2', xlab = 'Scores PC1')
abline(h = 0, v = 0)
points(confidence_ellipse95, type = 'l', lty = 2, col = 'darkgreen')
points(confidence_ellipse99, type = 'l', lty = 2, col = 'red')
x[as.numeric(names(tsquared[tsquared > tsquared_lim95])),c('Track_Title', 'Track_Artist', 'Genre')]
outlier_tracks <- x[as.numeric(names(tsquared[tsquared > tsquared_lim95])),c('Track_Title', 'Track_Artist', 'Genre')]
round(z[as.numeric(names(tsquared[tsquared > tsquared_lim95])),], digits = 2)
## acousticness danceability energy instrumentalness liveness loudness
## 21 0.88 0.21 -0.51 -0.90 -0.23 -0.04
## 146 0.35 0.49 -0.79 1.26 2.13 -0.20
## speechiness tempo valence
## 21 4.30 -0.95 1.08
## 146 -0.61 -0.41 -0.68
audio_z <- as.data.frame(z[as.numeric(names(tsquared[tsquared > tsquared_lim95])),])
outlier_tracks$acousticness <- audio_z$acousticness
outlier_tracks$danceability <- audio_z$danceability
outlier_tracks$energy <- audio_z$energy
outlier_tracks$instrumentalness <- audio_z$instrumentalness
outlier_tracks$liveness <- audio_z$liveness
outlier_tracks$loudness <- audio_z$loudness
outlier_tracks$speechiness <- audio_z$speechiness
outlier_tracks$tempo <- audio_z$tempo
outlier_tracks$valence <- audio_z$valence
# Finally, combining residuals and scores (i.e. Hotelling's T^2)
# cumsum(pc_cor$sdev^2 / sum(pc_cor$sdev^2))[num_comp]
plot(tsquared, SPE_2,
xlab = paste("Hotelling's T2 (", round(cumsum(pc_cor$sdev^2 / sum(pc_cor$sdev^2))[num_comp]*100, digits = 2), "%)"),
ylab = paste("Q-residuals (", round((1-cumsum(pc_cor$sdev^2 / sum(pc_cor$sdev^2))[num_comp])*100, digits = 2), "%)"))
abline(h = lim_k2_95, lty = 2, col = 'darkgreen') # seven residual outliers
abline(v = tsquared_lim95, lty = 2, col = 'darkgreen') # two score outliers
library("robustbase")
out <- adjOutlyingness(z, ndir=5000, clower=0, cupper=0)
hist(out$adjout)
rug(out$adjout)
z[!out$nonOut,]# as we can confirm, the outlying cases differ strongly in their characteristics from their respective centers, that is especially liveness and energy are inconsistent with the model
## acousticness danceability energy instrumentalness liveness loudness
## 15 -0.0944678 0.3050881 0.4781728 -0.9049362 4.3284375 0.5324131
## 21 0.8823223 0.2129273 -0.5076300 -0.9049323 -0.2300724 -0.0352499
## 51 -0.7253561 0.2925207 1.1679094 -0.9049362 7.1823764 0.8712256
## 96 -0.5613949 1.5073675 0.8197876 -0.9042543 4.8251982 0.5040817
## 140 1.5880799 -1.8187992 -1.6889666 0.5378426 3.1693292 -1.7704512
## 145 1.4190201 -1.7098819 -1.8773428 -0.9048587 -0.3469572 -4.0775846
## speechiness tempo valence
## 15 -0.3911434 1.1156910 1.3435281
## 21 4.3016000 -0.9477023 1.0815371
## 51 -0.5411446 -0.4296090 1.6680842
## 96 1.4994957 -0.0694509 0.1156894
## 140 -0.5848949 -1.9895640 -1.0077438
## 145 -0.6432287 -1.6341112 -1.1700219
x[rownames(z[!out$nonOut,]),c("Track_Title", "Track_Artist", "Genre", "Track_Popularity")]
library("paran")
paran(z, centile=95, all=T, graph=T)
##
## Using eigendecomposition of correlation matrix.
## Computing: 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
##
##
## Results of Horn's Parallel Analysis for component retention
## 270 iterations, using the 95 centile estimate
##
## --------------------------------------------------
## Component Adjusted Unadjusted Estimated
## Eigenvalue Eigenvalue Bias
## --------------------------------------------------
## 1 3.866214 4.305594 0.439379
## 2 1.164901 1.462154 0.297253
## 3 0.783579 0.979100 0.195521
## 4 0.698837 0.813018 0.114181
## 5 0.656256 0.694514 0.038258
## 6 0.331472 0.301951 -0.02952
## 7 0.315614 0.222367 -0.09324
## 8 0.306746 0.140507 -0.16623
## 9 0.319864 0.080790 -0.23907
## --------------------------------------------------
##
## Adjusted eigenvalues > 1 indicate dimensions to retain.
## (2 components retained)
x$pc_1 <- pc_cor$x[,1]
x$pc_2 <- pc_cor$x[,2]
x$Release_Date <- as.Date(paste(x$Release_Date, 1, 1, sep = "-"))
x$days_release_orig <- as.integer(round(difftime('2020-03-01', x$Release_Date, units = "days"), digits = 0))
x$Release_Date <- NULL
x$days_release <- as.numeric(scale(x$days_release_orig))
drop.cols <- c('Artist_ID', 'Genre', 'Track_Artist','Track_ID', 'Track_Title', 'key',
'mode', 'time_signature', 'video_ID', 'Charts', 'acousticness', 'danceability', 'energy', 'instrumentalness',
'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'Artist_Popularity', 'Track_Popularity', 'Artist_Popularity_quantile', 'viewsCount_quantile', 'days_release_orig', 'pc_1', 'pc_2')
x_sel <- dplyr::select(x, -one_of(drop.cols))
complete_index <- as.numeric(rownames(x_sel))
z <- scale(x_sel)
# inverse and partial correlations
p <- solve(cor(z, use="complete.obs"))
# if the model holds then the non-diagonal elements of Rô€€€1
# must be close to zero (relative to the diagonal element)
levelplot(p, scales=list(x=list(rot=90)))
# Partial correlations
pr <- -p/sqrt(outer(diag(p), diag(p)))
levelplot(pr, scales=list(x=list(rot=90)))
KMO(z)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = z)
## Overall MSA = 0.74
## MSA for each item =
## Artist_Albums_Number Artist_Albums_Tracks_Number
## 0.81 0.70
## Artist_Appearances_Number Artist_Appearances_Tracks_Number
## 0.56 0.53
## Artist_Compilations_Number Artist_Compilations_Tracks_Number
## 0.71 0.69
## Artist_Follower Artist_Singles_Number
## 0.71 0.78
## Artist_Singles_Tracks_Number Track_Duration_ms
## 0.60 0.70
## commentCount dislikeCount
## 0.85 0.84
## likeCount viewsCount
## 0.80 0.85
## days_release
## 0.65
# result: 0.74 overall: middling, every individual MSA value is as least as high as 0.53 (middling)
cortest.bartlett(z) # result: correlation matrix is NOT an identity matrix, so proceed with factor analysis
## R was not square, finding R from data
## $chisq
## [1] 3167.088
##
## $p.value
## [1] 0
##
## $df
## [1] 105
scree(z)
library("paran")
paran(z, centile=95, all=T, graph=T) # four factors are appropriate
##
## Using eigendecomposition of correlation matrix.
## Computing: 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
##
##
## Results of Horn's Parallel Analysis for component retention
## 450 iterations, using the 95 centile estimate
##
## --------------------------------------------------
## Component Adjusted Unadjusted Estimated
## Eigenvalue Eigenvalue Bias
## --------------------------------------------------
## 1 3.999864 4.590215 0.590350
## 2 3.297602 3.761880 0.464278
## 3 1.660186 2.026943 0.366756
## 4 1.051611 1.333118 0.281506
## 5 0.747370 0.955038 0.207667
## 6 0.510380 0.649312 0.138931
## 7 0.518990 0.598370 0.079379
## 8 0.353892 0.377677 0.023785
## 9 0.271750 0.245019 -0.02673
## 10 0.262400 0.177595 -0.08480
## 11 0.243843 0.106222 -0.13762
## 12 0.286497 0.096959 -0.18953
## 13 0.273755 0.036315 -0.23743
## 14 0.320999 0.025889 -0.29511
## 15 0.375884 0.019441 -0.35644
## --------------------------------------------------
##
## Adjusted eigenvalues > 1 indicate dimensions to retain.
## (4 components retained)
Loadings
pca <- principal(z, nfactors=4, rotate="none") # h2 are "communalities"
pa <- fa(z, nfactors=4, rotate="none", fm="pa") # principal axis extraction
uls <- fa(z, nfactors=4, rotate="none")
ml <- fa(z, nfactors=4, rotate="none", fm="ml")
library("xtable")
# xtable(unclass(ml$loadings))
print(ml$loadings, sort = T)
##
## Loadings:
## ML2 ML3 ML1 ML4
## commentCount 0.965 -0.144
## dislikeCount 0.968 -0.161
## likeCount 0.980 -0.145
## viewsCount 0.943 -0.108
## Artist_Albums_Number 0.750 0.503
## Artist_Albums_Tracks_Number 0.638 0.232 0.587
## Artist_Compilations_Number 0.876 0.365
## Artist_Compilations_Tracks_Number 0.896 0.408
## Artist_Singles_Number 0.358 0.685 -0.206
## Artist_Singles_Tracks_Number -0.105 0.988
## Artist_Appearances_Number 0.212 0.870
## Artist_Appearances_Tracks_Number 0.231 0.551
## Artist_Follower 0.335
## Track_Duration_ms 0.268 0.375
## days_release 0.384 0.106 0.392
##
## ML2 ML3 ML1 ML4
## SS loadings 3.879 2.849 2.315 1.756
## Proportion Var 0.259 0.190 0.154 0.117
## Cumulative Var 0.259 0.449 0.603 0.720
print(ml$loadings, cutoff=.4, sort = T)
##
## Loadings:
## ML2 ML3 ML1 ML4
## commentCount 0.965
## dislikeCount 0.968
## likeCount 0.980
## viewsCount 0.943
## Artist_Albums_Number 0.750 0.503
## Artist_Albums_Tracks_Number 0.638 0.587
## Artist_Compilations_Number 0.876
## Artist_Compilations_Tracks_Number 0.896 0.408
## Artist_Singles_Number 0.685
## Artist_Singles_Tracks_Number 0.988
## Artist_Appearances_Number 0.870
## Artist_Appearances_Tracks_Number 0.551
## Artist_Follower
## Track_Duration_ms
## days_release
##
## ML2 ML3 ML1 ML4
## SS loadings 3.879 2.849 2.315 1.756
## Proportion Var 0.259 0.190 0.154 0.117
## Cumulative Var 0.259 0.449 0.603 0.720
set.seed(42)
ml.varimax <- fa(z, nfactors=4, rotate="varimax", fm="ml")
print(ml.varimax$loadings, cutoff=.4, sort = T)
##
## Loadings:
## ML2 ML3 ML4 ML1
## commentCount 0.974
## dislikeCount 0.979
## likeCount 0.989
## viewsCount 0.950
## Artist_Albums_Number 0.857
## Artist_Compilations_Number 0.948
## Artist_Compilations_Tracks_Number 0.976
## Artist_Albums_Tracks_Number 0.619 0.643
## Artist_Appearances_Number 0.898
## Artist_Appearances_Tracks_Number 0.587
## Artist_Singles_Number 0.555 0.579
## Artist_Singles_Tracks_Number 0.955
## Artist_Follower
## Track_Duration_ms 0.430
## days_release 0.417
##
## ML2 ML3 ML4 ML1
## SS loadings 3.922 3.457 2.015 1.404
## Proportion Var 0.261 0.230 0.134 0.094
## Cumulative Var 0.261 0.492 0.626 0.720
# xtable(unclass(ml.varimax$loadings))
fa.congruence(ml, ml.varimax)
## ML2 ML3 ML4 ML1
## ML2 1.00 -0.07 -0.09 -0.01
## ML3 -0.01 0.98 0.33 0.17
## ML1 -0.20 0.69 0.45 0.89
## ML4 -0.03 0.14 0.96 -0.09
par(mfrow=c(1,1))
threshold <- 0.4
plot(ml.varimax$loadings[,1], ml.varimax$loadings[,2], xlim = c(-1, 1), ylim = c(-1,1.1), xlab = 'ML2', ylab = 'ML3')
palette <- c('red', 'blue')
col_index <- ifelse((abs(ml.varimax$loadings[,1]) > threshold) | (abs(ml.varimax$loadings[,2]) > threshold), 1, 0)
cols <- palette[as.numeric(as.factor(col_index))]
points(ml.varimax$loadings[,1], ml.varimax$loadings[,2], pch=19, col=cols)
abline(h = 0, v= 0)
text(ml.varimax$loadings[,1:2], as.character(rownames(ml.varimax$loadings)), pos = 1, cex = 0.6, offset = 0.5)
rect(xleft = -threshold, ybottom = -threshold, xright = threshold, ytop = threshold)
# fa.diagram(ml.varimax, simple=TRUE, cut=.2, digits=2)
set.seed(42)
ml.promax <- fa(z, nfactors=4, rotate="promax", fm="ml")
## Loading required namespace: GPArotation
fa.congruence(ml.promax, ml.varimax)
## ML2 ML3 ML4 ML1
## ML2 1.00 -0.01 -0.03 -0.08
## ML3 -0.01 0.99 0.19 0.29
## ML4 0.00 0.09 0.97 0.10
## ML1 -0.09 0.24 0.12 0.99
ml.promax$Phi
## ML2 ML3 ML4 ML1
## ML2 1.00000000 -0.1234826 -0.1397866 0.07121945
## ML3 -0.12348257 1.0000000 0.3651684 0.16587388
## ML4 -0.13978664 0.3651684 1.0000000 -0.00887250
## ML1 0.07121945 0.1658739 -0.0088725 1.00000000
print(ml.promax$loadings, cutoff=.4, sort = T)
##
## Loadings:
## ML2 ML3 ML4 ML1
## commentCount 0.984
## dislikeCount 0.990
## likeCount 0.999
## viewsCount 0.962
## Artist_Albums_Number 0.860
## Artist_Albums_Tracks_Number 0.552 0.545
## Artist_Compilations_Number 0.999
## Artist_Compilations_Tracks_Number 1.018
## Artist_Appearances_Number 0.940
## Artist_Appearances_Tracks_Number 0.643
## Artist_Singles_Number 0.535 0.541
## Artist_Singles_Tracks_Number 0.956
## Artist_Follower
## Track_Duration_ms 0.436
## days_release
##
## ML2 ML3 ML4 ML1
## SS loadings 3.989 3.519 2.033 1.346
## Proportion Var 0.266 0.235 0.136 0.090
## Cumulative Var 0.266 0.501 0.636 0.726
#xtable(unclass(ml.promax$loadings))
print(ml.promax$Structure, cutoff = 0.4)
##
## Loadings:
## ML2 ML3 ML4 ML1
## Artist_Albums_Number 0.889
## Artist_Albums_Tracks_Number 0.721 0.744
## Artist_Appearances_Number 0.891
## Artist_Appearances_Tracks_Number 0.565
## Artist_Compilations_Number 0.941
## Artist_Compilations_Tracks_Number 0.980
## Artist_Follower
## Artist_Singles_Number 0.582 0.632
## Artist_Singles_Tracks_Number 0.958
## Track_Duration_ms 0.449
## commentCount 0.974
## dislikeCount 0.979
## likeCount 0.989
## viewsCount 0.947
## days_release 0.426 0.477
##
## ML2 ML3 ML4 ML1
## SS loadings 3.969 3.890 2.427 1.548
## Proportion Var 0.265 0.259 0.162 0.103
## Cumulative Var 0.265 0.524 0.686 0.789
# For orthoghonal rotations: cor(item, factor) = loadings ("pattern") matrix = structure matrix,
# since structure matrix = loadings matrix *%* factor intercorrelation matrix (which is identity for orthogonal rotations)
# example: ml.promax$loadings %*% ml.promax$Phi = structure matrix
par(mfrow=c(1,1))
threshold <- 0.5
plot(ml.promax$Structure[,1], ml.promax$Structure[,2], xlim = c(-1, 1), ylim = c(-1,1.1), xlab = 'ML2', ylab = 'ML3')
palette <- c('red', 'blue')
col_index <- ifelse((abs(ml.promax$Structure[,1]) > threshold) | (abs(ml.promax$Structure[,2]) > threshold), 1, 0)
cols <- palette[as.numeric(as.factor(col_index))]
points(ml.promax$Structure[,1], ml.promax$Structure[,2], pch=19, col=cols)
abline(h = 0, v= 0)
text(ml.promax$Structure[,1:2], as.character(rownames(ml.promax$Structure)), pos = 1, cex = 0.6, offset = 0.5)
rect(xleft = -threshold, ybottom = -threshold, xright = threshold, ytop = threshold)
fa1 <-factanal(z, factors=4, scores = 'regression', lower = 0.1) # ML with Kaiser normalization
head(fa1$scores)
## Factor1 Factor2 Factor3 Factor4
## 1 -0.09032514 4.7229399 -1.1982826 1.1913023
## 2 -0.06532926 1.9592530 0.6796187 1.9294387
## 3 -0.17155114 1.2161697 2.5043243 -0.6176557
## 4 -0.14220859 4.8213406 -1.0227099 0.2499034
## 5 -0.15666866 6.0158376 -1.7644434 -0.1556914
## 6 -0.16905143 0.1190256 2.0571621 -0.1523273
fa2 <- fa(z, nfactors=4) # oblimin rotation without Kaiser normalization
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
head(fa2$scores)
## MR2 MR1 MR3 MR4
## 1 -0.1257348 4.9076402 -1.4064326 3.2185555
## 2 -0.2186062 2.3024182 0.7123318 2.1407078
## 3 -0.2078290 1.3247000 2.3948606 -0.3356255
## 4 -0.1849004 4.2448612 0.0405285 1.8486009
## 5 -0.2345250 5.4651211 -0.7047564 1.0788027
## 6 -0.2682063 0.3496226 1.8536951 0.1795223
cor(fa1$scores, fa2$scores)
## MR2 MR1 MR3 MR4
## Factor1 0.99066727 -0.04263156 -0.03183455 0.02879820
## Factor2 -0.02300522 0.98601395 0.12472840 0.27124108
## Factor3 -0.03546517 0.11316250 0.97914604 0.09737382
## Factor4 -0.05458080 0.13298949 -0.03618965 0.93878256
cor(fa1$scores, ml.promax$scores)
## ML2 ML3 ML4 ML1
## Factor1 0.995994444 -0.06746742 -0.08311111 0.060236052
## Factor2 -0.051288886 0.97618626 0.22252324 0.086347208
## Factor3 -0.056169205 0.17796182 0.97556026 0.002189822
## Factor4 0.007625069 0.11489683 0.02445173 0.989366259
cor(ml.promax$scores, ml.varimax$scores)
## ML2 ML3 ML4 ML1
## ML2 0.99714527 -0.05043569 -0.058887859 0.01154380
## ML3 -0.06617328 0.97785070 0.193263920 0.08735674
## ML4 -0.08094263 0.19810346 0.979273972 0.03916156
## ML1 0.06253243 0.10010835 -0.004175889 0.99271103
fa_bartlett_scores <- fa(z, nfactors=4, rotate="varimax", fm="ml", scores = 'Bartlett')
cor(ml.varimax$scores, fa_bartlett_scores$scores)
## ML2 ML3 ML4 ML1
## ML2 9.999945e-01 -1.574145e-18 8.383016e-17 -1.545713e-17
## ML3 4.392591e-17 9.999300e-01 7.699237e-16 -1.960363e-16
## ML4 3.900769e-18 7.222804e-19 9.995550e-01 3.371781e-17
## ML1 -5.920793e-18 -3.169430e-16 5.198842e-16 9.996269e-01
cor(fa_bartlett_scores$scores)
## ML2 ML3 ML4 ML1
## ML2 1.0000000000 0.0003036624 0.003257071 -0.0006236104
## ML3 0.0003036624 1.0000000000 -0.011669369 -0.0016210978
## ML4 0.0032570710 -0.0116693685 1.000000000 -0.0272409055
## ML1 -0.0006236104 -0.0016210978 -0.027240905 1.0000000000
# sum scores
threshold <- 0.5
vars_1 <- abs(ml.promax$Structure[,1])>threshold
vars_2 <- abs(ml.promax$Structure[,2])>threshold
vars_3 <- abs(ml.promax$Structure[,3])>threshold
vars_4 <- abs(ml.promax$Structure[,4])>threshold
key.list <- list(one = as.numeric(which(vars_1==1)),
two = as.numeric(which(vars_2==1)),
three = as.numeric(which(vars_3==1)),
four = as.numeric(which(vars_4==1)))
sign.mat <- cbind(sign(ml.promax$Structure[,1]),
sign(ml.promax$Structure[,2]),
sign(ml.promax$Structure[,3]),
sign(ml.promax$Structure[,4]))
keys <- make.keys(z, key.list,item.labels = colnames(z))
si <- scoreItems(keys * sign.mat, z) # these are the scores I finally use
pairs(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]))
cor(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]))
## [,1] [,2] [,3] [,4]
## [1,] 1.00000000 -0.07190494 -0.04542303 -0.04038714
## [2,] -0.07190494 1.00000000 0.42361028 0.62898318
## [3,] -0.04542303 0.42361028 1.00000000 0.19564730
## [4,] -0.04038714 0.62898318 0.19564730 1.00000000
print(ml.promax$loadings, cutoff = 0.5, sort = T)
##
## Loadings:
## ML2 ML3 ML4 ML1
## commentCount 0.984
## dislikeCount 0.990
## likeCount 0.999
## viewsCount 0.962
## Artist_Albums_Number 0.860
## Artist_Albums_Tracks_Number 0.552 0.545
## Artist_Compilations_Number 0.999
## Artist_Compilations_Tracks_Number 1.018
## Artist_Appearances_Number 0.940
## Artist_Appearances_Tracks_Number 0.643
## Artist_Singles_Number 0.535 0.541
## Artist_Singles_Tracks_Number 0.956
## Artist_Follower
## Track_Duration_ms
## days_release
##
## ML2 ML3 ML4 ML1
## SS loadings 3.989 3.519 2.033 1.346
## Proportion Var 0.266 0.235 0.136 0.090
## Cumulative Var 0.266 0.501 0.636 0.726
# I apply a threshold of 0.5 for manual computation of sum scores
f2 <- (z[,11] + z[,12] + z[,13] + z[,14])/4 # commentCount, dislikeCount, likeCount, viewsCount
f3 <- (z[,1] + z[,2] + z[,5])/3 # Artist_Albums_Number, Artist_Compilations_Number, Artist_Compilations_Tracks_Number
f4 <- (z[,2] + z[,3] + z[,4])/3 # Artist_Albums_Tracks_Number, Artist_Appearances_Number, Artist_Appearances_Tracks_Number
f1 <- (z[,8] + z[,9])/2 # Artist_Singles_Number, Artist_Singles_Tracks_Number
pairs(cbind(f2, f3, f4, f1))
cor(cbind(f2, f3, f4, f1))
## f2 f3 f4 f1
## f2 1.00000000 -0.08246012 -0.04542303 -0.04038714
## f3 -0.08246012 1.00000000 0.51683413 0.47961885
## f4 -0.04542303 0.51683413 1.00000000 0.19564730
## f1 -0.04038714 0.47961885 0.19564730 1.00000000
psych::alpha(cor(z[,vars_1]), check.keys = T) # commentCount, dislikeCount, likeCount, viewsCount
##
## Reliability analysis
## Call: psych::alpha(x = cor(z[, vars_1]), check.keys = T)
##
## raw_alpha std.alpha G6(smc) average_r S/N median_r
## 0.99 0.99 0.99 0.95 74 0.96
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N var.r med.r
## commentCount 0.98 0.98 0.98 0.95 55 7.2e-04 0.95
## dislikeCount 0.98 0.98 0.98 0.94 49 9.3e-04 0.95
## likeCount 0.98 0.98 0.97 0.93 42 1.2e-03 0.92
## viewsCount 0.99 0.99 0.98 0.97 96 1.5e-05 0.97
##
## Item statistics
## r r.cor r.drop
## commentCount 0.98 0.97 0.97
## dislikeCount 0.98 0.98 0.97
## likeCount 0.99 0.99 0.99
## viewsCount 0.96 0.95 0.94
psych::alpha(cor(z[,vars_2]), check.keys = T) # Artist_Albums_Number, Artist_Albums_Tracks_Number, Artist_Compilations_Number, Artist_Compilations_Tracks_Number, Artist_Singles_Number
##
## Reliability analysis
## Call: psych::alpha(x = cor(z[, vars_2]), check.keys = T)
##
## raw_alpha std.alpha G6(smc) average_r S/N median_r
## 0.91 0.91 0.92 0.66 9.7 0.63
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N
## Artist_Albums_Number 0.86 0.86 0.88 0.60 6.1
## Artist_Albums_Tracks_Number 0.92 0.92 0.92 0.75 11.7
## Artist_Compilations_Number 0.86 0.86 0.87 0.61 6.2
## Artist_Compilations_Tracks_Number 0.85 0.85 0.85 0.59 5.7
## Artist_Singles_Number 0.92 0.92 0.93 0.75 12.1
## var.r med.r
## Artist_Albums_Number 0.047 0.60
## Artist_Albums_Tracks_Number 0.024 0.72
## Artist_Compilations_Number 0.039 0.63
## Artist_Compilations_Tracks_Number 0.034 0.61
## Artist_Singles_Number 0.023 0.74
##
## Item statistics
## r r.cor r.drop
## Artist_Albums_Number 0.93 0.92 0.89
## Artist_Albums_Tracks_Number 0.73 0.64 0.59
## Artist_Compilations_Number 0.93 0.94 0.88
## Artist_Compilations_Tracks_Number 0.95 0.97 0.92
## Artist_Singles_Number 0.72 0.62 0.58
psych::alpha(cor(z[,vars_3]), check.keys = T) # Artist_Albums_Tracks_Number, Artist_Appearances_Number, Artist_Appearances_Tracks_Number, Track_Duration_ms
##
## Reliability analysis
## Call: psych::alpha(x = cor(z[, vars_3]), check.keys = T)
##
## raw_alpha std.alpha G6(smc) average_r S/N median_r
## 0.74 0.74 0.69 0.48 2.8 0.52
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N
## Artist_Albums_Tracks_Number 0.69 0.69 0.52 0.52 2.18
## Artist_Appearances_Number 0.47 0.47 0.31 0.31 0.88
## Artist_Appearances_Tracks_Number 0.76 0.76 0.62 0.62 3.21
## var.r med.r
## Artist_Albums_Tracks_Number NA 0.52
## Artist_Appearances_Number NA 0.31
## Artist_Appearances_Tracks_Number NA 0.62
##
## Item statistics
## r r.cor r.drop
## Artist_Albums_Tracks_Number 0.79 0.65 0.53
## Artist_Appearances_Number 0.88 0.82 0.70
## Artist_Appearances_Tracks_Number 0.75 0.55 0.46
psych::alpha(cor(z[,vars_4]), check.keys = T) # Artist_Singles_Number, Artist_Singles_Tracks_Number
## Warning in matrix(unlist(drop.item), ncol = 10, byrow = TRUE): Datenlänge [16]
## ist kein Teiler oder Vielfaches der Anzahl der Spalten [10]
##
## Reliability analysis
## Call: psych::alpha(x = cor(z[, vars_4]), check.keys = T)
##
## raw_alpha std.alpha G6(smc) average_r S/N median_r
## 0.78 0.78 0.65 0.65 3.6 0.65
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N var.r
## Artist_Singles_Number 0.65 0.65 0.42 0.65 NA 0.65
## Artist_Singles_Tracks_Number 0.42 0.65 NA NA NA 0.42
## med.r
## Artist_Singles_Number 0.65
## Artist_Singles_Tracks_Number 0.65
##
## Item statistics
## r r.cor r.drop
## Artist_Singles_Number 0.91 0.73 0.65
## Artist_Singles_Tracks_Number 0.91 0.73 0.65
items_1 <- reverse.code((keys[,1] * sign.mat[,1])[(keys[,1] * sign.mat[,1]) != 0], z[, vars_1])
items_2 <- reverse.code((keys[,2] * sign.mat[,2])[(keys[,2] * sign.mat[,2]) != 0], z[, vars_2])
items_3 <- reverse.code((keys[,3] * sign.mat[,3])[(keys[,3] * sign.mat[,3]) != 0], z[, vars_3])
items_4 <- reverse.code((keys[,4] * sign.mat[,4])[(keys[,4] * sign.mat[,4]) != 0], z[, vars_4])
library("additivityTests")
tukey.test(items_1) # result: H0 rejected, i.e. sum score is insufficient to represent the variables (factor 2)
##
## Tukey test on 5% alpha-level:
##
## Test statistic: 16.19
## Critival value: 3.857
## The additivity hypothesis was rejected.
tukey.test(items_2) # result: H0 rejected (factor 3)
##
## Tukey test on 5% alpha-level:
##
## Test statistic: 9.276
## Critival value: 3.853
## The additivity hypothesis was rejected.
tukey.test(items_3) # result: H0 cannot be rejected (factor 4)
##
## Tukey test on 5% alpha-level:
##
## Test statistic: 0.3105
## Critival value: 3.865
## The additivity hypothesis cannot be rejected.
tukey.test(items_4) # result: H0 cannot be rejected (factor 1)
##
## Tukey test on 5% alpha-level:
##
## Test statistic: 0.004627
## Critival value: 3.89
## The additivity hypothesis cannot be rejected.
cor(ml.promax$scores, si$scores)
## one two three four
## ML2 0.99592621 -0.09140514 -0.079332072 -0.002099964
## ML3 -0.09594147 0.97336006 0.416099225 0.487503635
## ML4 -0.11258045 0.39562748 0.960537575 0.200024816
## ML1 0.01023158 0.29765356 -0.008730584 0.882141227
fa_bartlett_scores <- fa(z, nfactors=4, rotate="promax", fm="ml", scores = 'Bartlett')
cor(ml.promax$scores, fa_bartlett_scores$scores)
## ML2 ML3 ML4 ML1
## ML2 0.99999331 -0.1228703 -0.132348818 0.071013967
## ML3 -0.12409086 0.9999497 0.347443877 0.166211182
## ML4 -0.14750946 0.3834347 0.999105855 -0.009335731
## ML1 0.07138876 0.1654449 -0.008420436 0.999492034
cor(si$scores, fa_bartlett_scores$scores)
## ML2 ML3 ML4 ML1
## one 0.995760702 -0.09422492 -0.09953451 0.01023589
## two -0.091504358 0.97260263 0.36373011 0.29006661
## three -0.082455588 0.42220226 0.95965169 -0.03857534
## four -0.001308527 0.48171167 0.16518758 0.87773348
cor(cbind(f2, f3, f4, f1), si$scores) # are almost identical now
## one two three four
## f2 1.00000000 -0.07190494 -0.04542303 -0.04038714
## f3 -0.08246012 0.97220279 0.51683413 0.47961885
## f4 -0.04542303 0.42361028 1.00000000 0.19564730
## f1 -0.04038714 0.62898318 0.19564730 1.00000000
cor(cbind(f2, f3, f4, f1), fa_bartlett_scores$scores)
## ML2 ML3 ML4 ML1
## f2 0.995760702 -0.09422492 -0.09953451 0.01023589
## f3 -0.113694191 0.96801245 0.47604146 0.14747026
## f4 -0.082455588 0.42220226 0.95965169 -0.03857534
## f1 -0.001308527 0.48171167 0.16518758 0.87773348
cor(si$scores, fa_bartlett_scores$scores)
## ML2 ML3 ML4 ML1
## one 0.995760702 -0.09422492 -0.09953451 0.01023589
## two -0.091504358 0.97260263 0.36373011 0.29006661
## three -0.082455588 0.42220226 0.95965169 -0.03857534
## four -0.001308527 0.48171167 0.16518758 0.87773348
# I choose the manually computed scoreItems and sum scores omitting <= 0.5 structure loadings and assigning the item with the larger loading to a single factor in case of ambiguity
par(mfrow=c(1,2))
hist(ml.promax$scores[,1], main = 'Youtube popularity factor', xlab = 'Score corresponding to factor 2 (promax)')
rug(ml.promax$scores[ifelse(x[complete_index, 'Charts'] == 1, TRUE, FALSE), 1], col="blue")
rug(ml.promax$scores[ifelse(x[complete_index, 'Charts'] != 1, TRUE, FALSE), 1], col="red")
hist(ml.promax$scores[,2], main = 'Music supply factor', xlab = 'Score corresponding to factor 3 (promax)')
rug(ml.promax$scores[ifelse(x[complete_index, 'Charts'] == 1, TRUE, FALSE), 2], col="blue")
rug(ml.promax$scores[ifelse(x[complete_index, 'Charts'] != 1, TRUE, FALSE), 2], col="red")
par(mfrow=c(1,2))
hist(si$scores[,1], main = 'Youtube popularity factor', xlab = 'Score corresponding to factor 2 (promax)')
rug(si$scores[ifelse(x[complete_index, 'Charts'] == 1, TRUE, FALSE), 1], col="blue")
rug(si$scores[ifelse(x[complete_index, 'Charts'] != 1, TRUE, FALSE), 1], col="red")
hist(si$scores[,2], main = 'Music supply factor', xlab = 'Score corresponding to factor 3 (promax)')
rug(si$scores[ifelse(x[complete_index, 'Charts'] == 1, TRUE, FALSE), 2], col="blue")
rug(si$scores[ifelse(x[complete_index, 'Charts'] != 1, TRUE, FALSE), 2], col="red")
par(mfrow=c(1,2))
hist(f2, main = 'Youtube popularity factor', xlab = 'Score corresponding to factor 2 (promax)')
rug(f2[ifelse(x[complete_index, 'Charts'] == 1, TRUE, FALSE)], col="blue")
rug(f2[ifelse(x[complete_index, 'Charts'] != 1, TRUE, FALSE)], col="red")
hist(f3, main = 'Music supply factor', xlab = 'Score corresponding to factor 3 (promax)')
rug(f3[ifelse(x[complete_index, 'Charts'] == 1, TRUE, FALSE)], col="blue")
rug(f3[ifelse(x[complete_index, 'Charts'] != 1, TRUE, FALSE)], col="red")
x$factor_2 <- si$scores[,1]
x$factor_3 <- si$scores[,2]
x$factor_4 <- si$scores[,3]
x$factor_1 <- si$scores[,4]
x$factor_2man <- f2
x$factor_3man <- f3
x$factor_4man <- f4
x$factor_1man <- f1
cor(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]), cbind(f2, f3, f4, f1))
## f2 f3 f4 f1
## [1,] 1.00000000 -0.08246012 -0.04542303 -0.04038714
## [2,] -0.07190494 0.97220279 0.42361028 0.62898318
## [3,] -0.04542303 0.51683413 1.00000000 0.19564730
## [4,] -0.04038714 0.47961885 0.19564730 1.00000000
cor(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]), fa_bartlett_scores$scores)
## ML2 ML3 ML4 ML1
## [1,] 0.995760702 -0.09422492 -0.09953451 0.01023589
## [2,] -0.091504358 0.97260263 0.36373011 0.29006661
## [3,] -0.082455588 0.42220226 0.95965169 -0.03857534
## [4,] -0.001308527 0.48171167 0.16518758 0.87773348
cor(cbind(si$scores[,1], si$scores[,2], si$scores[,3], si$scores[,4]), ml.promax$scores)
## ML2 ML3 ML4 ML1
## [1,] 0.995926214 -0.09594147 -0.1125804 0.010231584
## [2,] -0.091405140 0.97336006 0.3956275 0.297653556
## [3,] -0.079332072 0.41609922 0.9605376 -0.008730584
## [4,] -0.002099964 0.48750364 0.2000248 0.882141227
x$factor_2bartlett <- fa_bartlett_scores$scores[,1]
x$factor_3bartlett <- fa_bartlett_scores$scores[,2]
x$factor_4bartlett <- fa_bartlett_scores$scores[,3]
x$factor_1bartlett <- fa_bartlett_scores$scores[,4]
z <- scale(x_features)
library(factoextra)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Elbow method
fviz_nbclust(z, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2) + labs(title = "")
set.seed(42)
kz <- kmeans(z, c=4)
k_techno <- names(kz$cluster[kz$cluster == 1])
k_hiphop <- names(kz$cluster[kz$cluster == 2])
k_pop <- names(kz$cluster[kz$cluster == 3])
k_classic <- names(kz$cluster[kz$cluster == 4])
kz$cluster[k_techno] <- 4
kz$cluster[k_pop] <- 3
kz$cluster[k_hiphop] <- 2
kz$cluster[k_classic] <- 1
par(mfcol=c(1,1))
plot(pc_cor$x[,1], pc_cor$x[,2], col=kz$cluster, pch=19, xlab = 'PC1', ylab = 'PC2')
# project cluster centers to PCA feature space:
cluster_centers_pca <- kz$centers %*% pc_cor$rotation[, 1:2]
points(cluster_centers_pca[,1], cluster_centers_pca[,2], col = 'magenta', pch = 10, cex = 3)
plot(pc_cor$x[,1], pc_cor$x[,2], col=as.numeric(x$Genre), pch=19, xlab = 'PC1', ylab = 'PC2')
library("cluster")
sk_4 <- silhouette(kz$cluster, dist(z))
plot(sk_4, col=1:4, border=NA, ann = T, main = "")
library("caret")
##
## Attaching package: 'caret'
## The following object is masked _by_ '.GlobalEnv':
##
## RMSE
confusionMatrix(as.factor(as.numeric(kz$cluster)), as.factor(as.numeric(x$Genre)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 1 1 0 47
## 2 0 34 30 0
## 3 48 0 3 0
## 4 0 15 17 0
##
## Overall Statistics
##
## Accuracy : 0.1939
## 95% CI : (0.141, 0.2563)
## No Information Rate : 0.2551
## P-Value [Acc > NIR] : 0.9821
##
## Kappa : -0.0767
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.020408 0.6800 0.06000 0.0000
## Specificity 0.673469 0.7945 0.67123 0.7852
## Pos Pred Value 0.020408 0.5312 0.05882 0.0000
## Neg Pred Value 0.673469 0.8788 0.67586 0.7134
## Prevalence 0.250000 0.2551 0.25510 0.2398
## Detection Rate 0.005102 0.1735 0.01531 0.0000
## Detection Prevalence 0.250000 0.3265 0.26020 0.1633
## Balanced Accuracy 0.346939 0.7373 0.36562 0.3926
# Although k-means is not a classification algorithm, we can compute the proportions of genres per cluster to assign
# an interpretation for each cluster.
tab_kmeans <- table(kz$cluster, x$Genre)
#xtable(round(tab_kmeans/colSums(tab_kmeans), digits = 2)*100)
round(tab_kmeans/colSums(tab_kmeans), digits = 2)*100
##
## Classic Hip Hop Pop Techno
## 1 2 2 0 96
## 2 0 68 60 0
## 3 96 0 6 0
## 4 0 32 36 0
# It seems that Classic music and Techno music can be accurately distinguished but between Hip Hop and Pop there are
# many false negatives, e.g. 62% of the tracks were predicted to be of the genre Pop although the truth was that they were
# of class Hip Hop, underlining the ambiguity between those genres in terms of music theoretical characteristics nowadays.
# Vice versa, 16% of the tracks predicted to be Hip Hop music were actually Pop music.
set.seed(42)
cl2 <- pam(z, 4)
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl2$clustering)
k_classic <- names(cl2$clustering[cl2$clustering == 1])
k_techno <- names(cl2$clustering[cl2$clustering == 2])
k_pop <- names(cl2$clustering[cl2$clustering == 3])
k_hiphop <- names(cl2$clustering[cl2$clustering == 4])
cl2$clustering[k_techno] <- 4
cl2$clustering[k_pop] <- 3
cl2$clustering[k_hiphop] <- 2
cl2$clustering[k_classic] <- 1
par(mfcol=c(1,1))
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl2$clustering, pch=19, xlab = 'PC1', ylab = 'PC2')
points(pc_cor$x[rownames(cl2$medoids),1], pc_cor$x[rownames(cl2$medoids),2], col = 'magenta', pch = 10, cex = 3)
plot(pc_cor$x[,1], pc_cor$x[,2], col=as.numeric(x$Genre), pch=19, xlab = 'PC1', ylab = 'PC2')
# xtable(x[rownames(cl2$medoids), c('Track_Title', 'Track_Artist', 'Genre')])
x[rownames(cl2$medoids), c('Track_Title', 'Track_Artist', 'Genre')]
sm_4 <- silhouette(cl2$clustering, dist(z))
plot(sm_4, col=1:4, border=NA, main = "")
confusionMatrix(as.factor(as.numeric(cl2$clustering)), as.factor(as.numeric(x$Genre)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 48 0 2 0
## 2 0 28 13 0
## 3 0 21 35 0
## 4 1 1 0 47
##
## Overall Statistics
##
## Accuracy : 0.8061
## 95% CI : (0.7437, 0.859)
## No Information Rate : 0.2551
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7415
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.9796 0.5600 0.7000 1.0000
## Specificity 0.9864 0.9110 0.8562 0.9866
## Pos Pred Value 0.9600 0.6829 0.6250 0.9592
## Neg Pred Value 0.9932 0.8581 0.8929 1.0000
## Prevalence 0.2500 0.2551 0.2551 0.2398
## Detection Rate 0.2449 0.1429 0.1786 0.2398
## Detection Prevalence 0.2551 0.2092 0.2857 0.2500
## Balanced Accuracy 0.9830 0.7355 0.7781 0.9933
tab_kmedoids <- table(cl2$clustering, x$Genre)
# xtable((round(tab_kmedoids/colSums(tab_kmedoids), digits = 2))*100)
round(tab_kmedoids/colSums(tab_kmedoids), digits = 2)*100
##
## Classic Hip Hop Pop Techno
## 1 98 0 4 0
## 2 0 56 26 0
## 3 0 42 70 0
## 4 2 2 0 100
# Factor analysis + k-means on scores
library("psych")
KMO(z)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = z)
## Overall MSA = 0.83
## MSA for each item =
## acousticness danceability energy instrumentalness
## 0.83 0.92 0.80 0.67
## liveness loudness speechiness tempo
## 0.81 0.87 0.84 0.82
## valence
## 0.80
scree(z)
library("paran")
paran(z, centile=95, all=T, graph=T) # two factors are appropriate
##
## Using eigendecomposition of correlation matrix.
## Computing: 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
##
##
## Results of Horn's Parallel Analysis for component retention
## 270 iterations, using the 95 centile estimate
##
## --------------------------------------------------
## Component Adjusted Unadjusted Estimated
## Eigenvalue Eigenvalue Bias
## --------------------------------------------------
## 1 3.846102 4.305594 0.459491
## 2 1.166438 1.462154 0.295715
## 3 0.791006 0.979100 0.188093
## 4 0.708123 0.813018 0.104895
## 5 0.657852 0.694514 0.036661
## 6 0.328989 0.301951 -0.02703
## 7 0.317386 0.222367 -0.09501
## 8 0.304598 0.140507 -0.16409
## 9 0.313412 0.080790 -0.23262
## --------------------------------------------------
##
## Adjusted eigenvalues > 1 indicate dimensions to retain.
## (2 components retained)
fa <- fa(z, nfactors=2, rotate="varimax")
library("plot.matrix")
## Registered S3 method overwritten by 'plot.matrix':
## method from
## plot.loadings pls
plot(loadings(fa), las=1)
al1 <- psych::alpha(z[,c("acousticness", "danceability", "energy", "loudness")], check.keys = TRUE)
## Warning in psych::alpha(z[, c("acousticness", "danceability", "energy", : Some items were negatively correlated with total scale and were automatically reversed.
## This is indicated by a negative sign for the variable name.
al1
##
## Reliability analysis
## Call: psych::alpha(x = z[, c("acousticness", "danceability", "energy",
## "loudness")], check.keys = TRUE)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.95 0.95 0.94 0.82 18 0.0062 -0.6 0.93 0.81
##
## lower alpha upper 95% confidence boundaries
## 0.94 0.95 0.96
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## acousticness- 0.93 0.93 0.90 0.81 13 0.0092 0.00225 0.80
## danceability 0.94 0.94 0.93 0.85 16 0.0073 0.00298 0.86
## energy 0.92 0.92 0.88 0.79 11 0.0102 0.00058 0.79
## loudness 0.94 0.94 0.92 0.84 15 0.0078 0.00246 0.81
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## acousticness- 196 0.94 0.94 0.92 0.89 -2.4e+00 1
## danceability 196 0.91 0.91 0.86 0.84 -5.5e-18 1
## energy 196 0.96 0.96 0.95 0.92 1.6e-16 1
## loudness 196 0.92 0.92 0.88 0.85 6.0e-17 1
al2 <- psych::alpha(z[,c("instrumentalness", "valence")], check.keys = TRUE)
## Warning in psych::alpha(z[, c("instrumentalness", "valence")], check.keys = TRUE): Some items were negatively correlated with total scale and were automatically reversed.
## This is indicated by a negative sign for the variable name.
## Warning in matrix(unlist(drop.item), ncol = 10, byrow = TRUE): Datenlänge [16]
## ist kein Teiler oder Vielfaches der Anzahl der Spalten [10]
al2
##
## Reliability analysis
## Call: psych::alpha(x = z[, c("instrumentalness", "valence")], check.keys = TRUE)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.79 0.79 0.65 0.65 3.7 0.03 0.62 0.91 0.65
##
## lower alpha upper 95% confidence boundaries
## 0.73 0.79 0.85
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r
## instrumentalness- 0.65 0.65 0.42 0.65 NA NA 0.65
## valence 0.42 0.65 NA NA NA NA 0.42
## med.r
## instrumentalness- 0.65
## valence 0.65
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## instrumentalness- 196 0.91 0.91 0.73 0.65 1.2e+00 1
## valence 196 0.91 0.91 0.73 0.65 1.4e-17 1
scale <- cbind(al1$scores, al2$scores)
fviz_nbclust(scale, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")
reorder <- function(x, ...) {
dr <- dist(x)
hr <- hclust(dr)
dc <- dist(t(x))
hc <- hclust(dc)
x[hr$order, hc$order]
}
set.seed(42)
cl1 <- kmeans(z, 4)
set.seed(42)
cl2 <- kmeans(scale, 4)
print(reorder(table(cl1$cluster,cl2$cluster)))
##
## 1 3 2 4
## 3 51 0 0 0
## 4 0 49 0 0
## 1 0 0 14 18
## 2 0 0 42 22
par(mfcol=c(1,3))
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl1$cluster, pch=19)
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl2$cluster, pch=19)
plot(pc_cor$x[,1], pc_cor$x[,2], col=as.numeric(x$Genre), pch=19)
par(mfcol=c(1,2))
library("e1071")
set.seed(42)
cl1 <- cmeans(z, 4)
mcolor <- colorRamp(c("black", "green", "blue", "red"))
col <- rgb(mcolor(cl1$membership[,1]), max=255)
plot(pc_cor$x[,1], pc_cor$x[,2], pch=19, col=col, main = 'Soft-clustering (fuzzy c-means, k = 4)')
cl <- diana(z)
hcl <- cutree(as.hclust(cl), k = 4)
par(mfrow=c(1,1))
plot(pc_cor$x[,1], pc_cor$x[,2], col=hcl,
pch=19, cex=0.5)
library("proxy")
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
d <- as.matrix(dist(z))
heatmap(d)
s <- pr_dist2simil(d) # the darker, the more similar
heatmap(s)
par(mfcol=c(1,1))
d <- dist(z)
# hclust
cl1 <- hclust(d, method = 'ward.D2')
memb <- cutree(cl1, 4)
plot(pc_cor$x[,1], pc_cor$x[,2], col=memb)
plot(cl1, ann = F)
title(xlab = "Ward.D2", ylab = "Height")
library("ggplot2")
library("tibble")
ggplot(cl1$height %>%
as_tibble() %>%
add_column(groups = length(cl1$height):1) %>%
rename(height=value),
aes(x=groups, y=height)) +
geom_point() +
geom_line()
## Warning: Calling `as_tibble()` on a vector is discouraged, because the behavior is likely to change in the future. Use `tibble::enframe(name = NULL)` instead.
## This warning is displayed once per session.
library("mclust")
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:MVN':
##
## mvn
## The following object is masked from 'package:psych':
##
## sim
set.seed(42)
cl <- Mclust(z)
summary(cl)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVI (diagonal, varying volume and shape) model with 9 components:
##
## log-likelihood n df BIC ICL
## -232.2578 196 170 -1361.795 -1368.181
##
## Clustering table:
## 1 2 3 4 5 6 7 8 9
## 9 23 7 35 7 65 24 8 18
par(mfrow=c(2,2))
plot(cl, "BIC")
plot(pc_cor$x[,1], pc_cor$x[,2], col = cl$classification)
EM_density_plot <- plot(cl, "density")
par(mfcol=c(2,2))
library("fpc")
##
## Attaching package: 'fpc'
## The following object is masked from 'package:xtable':
##
## xtable
set.seed(42)
cl <- dbscan(z, 0.5, scale=F, MinPts = 5)
col <- c('grey', rainbow(max(cl$cluster)))
plot(pc_cor$x[,1], pc_cor$x[,2], pch=19, col=col[1+cl$cluster])
set.seed(42)
cl <- dbscan(z, 1, scale=F, MinPts = 5)
col <- c('grey', rainbow(max(cl$cluster)))
plot(pc_cor$x[,1], pc_cor$x[,2], pch=19, col=col[1+cl$cluster])
set.seed(42)
cl <- dbscan(z, 1.5, scale=F, MinPts = 5)
col <- c('grey', rainbow(max(cl$cluster)))
plot(pc_cor$x[,1], pc_cor$x[,2], pch=19, col=col[1+cl$cluster])
set.seed(42)
cl <- dbscan(z, 2, scale=F, MinPts = 5)
col <- c('grey', rainbow(max(cl$cluster)))
plot(pc_cor$x[,1], pc_cor$x[,2], pch=19, col=col[1+cl$cluster])
# Mixed clustering
# hclust
d <- dist(z)
set.seed(42)
cl1 <- hclust(d, method="ward.D2")
memb <- cutree(cl1, 4)
# kmeans
groupm <- aggregate(z, list(memb), mean)
centers <- cbind(groupm$acousticness,
groupm$danceability,
groupm$energy,
groupm$instrumentalness,
groupm$liveness,
groupm$loudness,
groupm$speechiness,
groupm$tempo,
groupm$valence)
set.seed(42)
cl2 <- kmeans(z, centers = centers)
# compare results (confusion matrix)
table(memb, cl2$cluster) # result: k-means has classified 4 observations differently than hclust
##
## memb 1 2 3 4
## 1 51 0 0 1
## 2 0 49 0 0
## 3 0 0 7 0
## 4 0 0 3 85
par(mfcol=c(1,2))
plot(pc_cor$x[,1], pc_cor$x[,2], col=cl2$cluster,
main="Cluster predictions, mixed clustering", pch=19, xlab = 'PC1', ylab = 'PC2')
# project cluster centers to PCA feature space:
cluster_centers_pca <- cl2$centers %*% pc_cor$rotation[, 1:2]
points(cluster_centers_pca[,1], cluster_centers_pca[,2], col = 'magenta', pch = 10, cex = 3)
plot(pc_cor$x[,1], pc_cor$x[,2], col=as.numeric(x$Genre),
main="Truth", pch=19, xlab = 'PC1', ylab = 'PC2')
library("NbClust")
# Total variance explained
tve <- rep(NA, 15)
for (k in 2:15) {
clk <- kmeans(z, k)
tve[k] <- 1-clk$tot.withinss/clk$totss
}
plot(tve, type="b")
set.seed(42)
NbClust(z, method="ward.D2", index="ch") # result: k*=2
## $All.index
## 2 3 4 5 6 7 8 9
## 139.7710 120.4259 98.3300 88.7335 82.5706 74.8123 69.3249 65.2113
## 10 11 12 13 14 15
## 61.9272 59.6528 58.1524 57.0914 56.0043 55.3023
##
## $Best.nc
## Number_clusters Value_Index
## 2.000 139.771
##
## $Best.partition
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1 2 2
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 141 142 143 145 146 147 148 149 150 152 153 154 155 156 157 158 159 160 161 162
## 1 1 1 1 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## 163 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 183 184
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1
set.seed(42)
NbClust(z, method="ward.D2") # result: majority vote (12) for k*=3.
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 12 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 2.9225 139.7710 59.4773 3.0748 463.1206 4.072864e+17 21596.043 1020.0706
## 3 3.5131 120.4259 24.7670 6.5366 826.6680 1.433964e+17 11111.650 780.7158
## 4 0.8659 98.3300 24.3660 7.3358 952.5966 1.340864e+17 8423.499 691.9237
## 5 1.1693 88.7335 21.0237 9.1153 1104.3665 9.658613e+16 6515.744 614.0028
## 6 2.4339 82.5706 12.1013 11.0072 1213.6275 7.964860e+16 4756.855 553.1201
## 7 0.9542 74.8123 11.5500 10.5538 1366.8443 4.961074e+16 4347.806 520.0008
## 8 1.0040 69.3249 10.9477 10.3915 1447.6622 4.290265e+16 3883.982 490.0530
## 9 1.0561 65.2113 10.1985 10.3909 1533.7264 3.500170e+16 3428.487 463.0863
## 10 0.8937 61.9272 10.6113 10.4543 1598.4194 3.106406e+16 2948.095 439.1369
## 11 0.9214 59.6528 11.0369 10.7463 1689.2379 2.364879e+16 2610.542 415.4363
## 12 0.9994 58.1524 10.9827 11.8133 1812.3952 1.501397e+16 2480.136 392.0472
## 13 1.1530 57.0914 9.8993 12.5013 1911.2783 1.063937e+16 2136.940 369.9646
## 14 0.9733 56.0043 10.0901 13.0538 1965.2543 9.368841e+15 1826.549 350.9785
## 15 1.3913 55.3023 7.9632 13.7026 2032.3638 7.636806e+15 1567.078 332.5424
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 21.7400 1.7205 0.3120 0.9783 0.4265 0.7137 56.9638 2.3921 0.4113
## 3 28.4878 2.2479 0.2692 1.1655 0.3482 0.8329 18.6544 1.1917 0.4042
## 4 29.5447 2.5364 0.3313 1.1514 0.3628 0.8040 20.9664 1.4472 0.3785
## 5 31.1993 2.8583 0.2878 1.4486 0.3458 0.6822 16.3044 2.7196 0.3548
## 6 32.3856 3.1729 0.3217 1.2612 0.3517 0.8201 10.9717 1.2919 0.3361
## 7 37.4071 3.3750 0.3024 1.4710 0.2876 0.7661 14.9600 1.7967 0.3153
## 8 39.0800 3.5812 0.2885 1.5297 0.2588 0.6165 8.0877 3.4690 0.2988
## 9 40.7503 3.7898 0.2840 1.4543 0.2668 0.6327 27.2817 3.4130 0.2849
## 10 42.0453 3.9965 0.2731 1.4046 0.2679 0.6075 12.9223 3.6951 0.2731
## 11 44.1287 4.2245 0.2665 1.3700 0.2704 0.8028 9.5795 1.4381 0.2628
## 12 48.7617 4.4765 0.2489 1.3929 0.2545 0.7169 11.0589 2.2899 0.2538
## 13 52.5702 4.7437 0.2390 1.3446 0.2621 0.6343 12.6834 3.3114 0.2459
## 14 53.4624 5.0003 0.2303 1.2754 0.2686 1.7043 -2.0662 -2.0679 0.2387
## 15 54.9323 5.2775 0.2808 1.2230 0.2738 0.6088 6.4256 3.5078 0.2322
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 510.0353 0.6596 1.5894 0.3791 0.1672 0.0014 1.3874 2.1143 0.6318
## 3 260.2386 0.5824 -0.0150 1.0162 0.1346 0.0014 1.4240 1.7562 0.5419
## 4 172.9809 0.6279 0.6392 1.0868 0.1782 0.0015 1.6964 1.6896 0.6632
## 5 122.8006 0.5917 0.1784 1.6348 0.1280 0.0016 1.9833 1.5926 0.7320
## 6 92.1867 0.5988 0.3878 1.7244 0.1504 0.0017 1.7666 1.5282 0.5752
## 7 74.2858 0.5900 0.9796 1.9096 0.1504 0.0018 1.8074 1.4831 0.5428
## 8 61.2566 0.5491 0.0245 2.3354 0.1264 0.0018 2.0264 1.4334 0.4935
## 9 51.4540 0.5521 0.2609 2.3378 0.1264 0.0018 1.9033 1.4024 0.4559
## 10 43.9137 0.5485 0.1750 2.4413 0.1264 0.0018 1.8289 1.3667 0.4188
## 11 37.7669 0.5484 0.3502 2.4821 0.1264 0.0019 1.7465 1.3323 0.3863
## 12 32.6706 0.5333 0.2567 2.7392 0.1264 0.0020 1.7621 1.2967 0.3768
## 13 28.4588 0.5279 0.1186 2.8500 0.1264 0.0021 1.6812 1.2591 0.3519
## 14 25.0699 0.5285 -0.0290 2.8823 0.1264 0.0021 1.6455 1.2326 0.3438
## 15 22.1695 0.5298 0.1067 2.8742 0.1535 0.0022 1.8933 1.2092 0.2842
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.8094 33.4492 0.0110
## 3 0.7816 25.9821 0.2967
## 4 0.7759 24.8432 0.1638
## 5 0.6927 15.5269 0.0046
## 6 0.7297 18.5198 0.2388
## 7 0.7278 18.3290 0.0668
## 8 0.5577 10.3089 0.0008
## 9 0.7237 17.9442 0.0005
## 10 0.6225 12.1297 0.0003
## 11 0.7045 16.3556 0.1702
## 12 0.6665 14.0075 0.0174
## 13 0.6355 12.6164 0.0009
## 14 0.3854 7.9739 1.0000
## 15 0.5139 9.4601 0.0009
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 3.0000 2.000 3.0000 15.0000 3.0000 3.0000e+00 3.00
## Value_Index 3.5131 139.771 34.7103 13.7026 363.5475 2.5458e+17 10484.39
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.0000 3.0000 3.000 14.0000 2.0000 2.0000 3.0000
## Value_Index 150.5628 6.7478 -0.239 0.2303 0.9783 0.4265 0.8329
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 3.0000 3.0000 2.0000 3.0000 2.0000 2.0000 2.0000
## Value_Index 18.6544 1.1917 0.4113 249.7967 0.6596 1.5894 0.3791
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 4.0000 0 2.0000 0 15.0000
## Value_Index 0.1782 0 1.3874 0 0.2842
##
## $Best.partition
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 1 3 3
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 1 3 3 3
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 141 142 143 145 146 147 148 149 150 152 153 154 155 156 157 158 159 160 161 162
## 1 1 1 1 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## 163 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 183 184
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1
# Silhouette method
set.seed(42)
fviz_nbclust(z, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette method")
d <- dist(z)
cl1 <- hclust(d, method="ward.D2")
memb2 <- cutree(cl1, 2)
memb3 <- cutree(cl1, 3)
memb4 <- cutree(cl1, 4)
library("cluster")
par(mfcol=c(2,2))
plot(pc_cor$x[,1], pc_cor$x[,2], col=memb2)
s2 <- silhouette(memb2, d)
plot(s2, col=1:2, border=NA)
plot(pc_cor$x[,1], pc_cor$x[,2], col=memb3)
s3 <- silhouette(memb3, d)
plot(s3, col=1:3, border=NA)
par(mfcol=c(1,3))
clusplot(z, memb2, col.p=memb2)
clusplot(z, memb3, col.p=memb3)
clusplot(z, memb4, col.p=memb4)
library("cluster")
set.seed(42)
cl2 <- pam(z, 4)
k_classic <- names(cl2$clustering[cl2$clustering == 1])
k_techno <- names(cl2$clustering[cl2$clustering == 2])
k_pop <- names(cl2$clustering[cl2$clustering == 3])
k_hiphop <- names(cl2$clustering[cl2$clustering == 4])
cl2$clustering[k_techno] <- 4
cl2$clustering[k_pop] <- 3
cl2$clustering[k_hiphop] <- 2
cl2$clustering[k_classic] <- 1
x$cluster <- as.factor(cl2$clustering)
x$Artist_Follower <- x$Artist_Follower/1000000
x$fa_1 <- x$factor_1
x$fa_2 <- x$factor_2
x$fa_3 <- x$factor_3
x$fa_4 <- x$factor_4
par(mfrow=c(3,3))
nx <- c("fa_1", "fa_2", "fa_3", "fa_4", "pc_1", "pc_2", "Artist_Follower", "days_release_orig", "Track_Popularity")
for (i in 1:length(nx)) {
lmi <- lm(as.formula(paste('Artist_Popularity', nx[i], sep="~")), data=x)
summary(lmi)
plot(x[,nx[i]], x[,'Artist_Popularity'], xlab=nx[i], ylab='Artist Popularity', main=sprintf("R^2=%.2f", summary(lmi)$r.squared))
abline(lmi, col="red")
}
model1 <- lm(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
summary(model1)
##
## Call:
## lm(formula = Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 +
## pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.4051 -6.6722 0.3102 7.0904 21.0544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.5143111 0.8690218 74.238 < 2e-16 ***
## fa_1 -0.0816610 1.1386538 -0.072 0.942904
## fa_2 1.8480927 0.7571880 2.441 0.015589 *
## fa_3 5.4094057 1.3919701 3.886 0.000141 ***
## fa_4 -1.2986520 1.2829029 -1.012 0.312714
## pc_1 1.4023625 0.5912068 2.372 0.018706 *
## pc_2 7.4542956 0.6570917 11.344 < 2e-16 ***
## Artist_Follower 0.8090864 0.1199206 6.747 1.84e-10 ***
## days_release_orig -0.0004646 0.0002093 -2.220 0.027631 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.765 on 187 degrees of freedom
## Multiple R-squared: 0.6364, Adjusted R-squared: 0.6208
## F-statistic: 40.91 on 8 and 187 DF, p-value: < 2.2e-16
# fa_2: "Youtube"
# fa_3: "Artist content supply"
# fa_4: "Artist long-term activity"
# fa_1: "One hit wonder"
# pc_1 (between classic and Pop, Hip Hop, Techno): strong negative loadings: acousticness and instrumentalness, strong positive loadings: danceability, energy and loudness
# pc_2 (between Techno and Pop, Hip Hop): strong negative loadings: instrumentalness and tempo, strong positive loadings: speechiness and valence
vif(model1) # fa_1, fa_3, fa_4, pc_1 mild collinearity
## fa_1 fa_2 fa_3 fa_4
## 2.181823 1.127092 2.881682 2.202112
## pc_1 pc_2 Artist_Follower days_release_orig
## 3.077405 1.290976 1.276809 1.607455
library("perturb") # condition index
colldiag(model1)
## Condition
## Index Variance Decomposition Proportions
## intercept fa_1 fa_2 fa_3 fa_4 pc_1 pc_2 Artist_Follower
## 1 1.000 0.002 0.017 0.003 0.030 0.035 0.031 0.000 0.000
## 2 1.314 0.106 0.011 0.066 0.002 0.000 0.002 0.071 0.129
## 3 1.462 0.003 0.135 0.054 0.013 0.014 0.008 0.093 0.103
## 4 1.562 0.203 0.000 0.220 0.006 0.015 0.004 0.141 0.006
## 5 1.914 0.002 0.036 0.381 0.041 0.049 0.017 0.373 0.025
## 6 2.154 0.035 0.007 0.265 0.055 0.194 0.002 0.031 0.405
## 7 2.926 0.635 0.006 0.006 0.000 0.069 0.042 0.099 0.121
## 8 3.154 0.000 0.412 0.007 0.169 0.323 0.301 0.085 0.204
## 9 3.912 0.015 0.376 0.000 0.683 0.301 0.593 0.107 0.007
## days_release_orig
## 1 0.029
## 2 0.036
## 3 0.023
## 4 0.030
## 5 0.005
## 6 0.048
## 7 0.739
## 8 0.069
## 9 0.022
# I remove some variables according to insignificance (fa_1, fa_4)
model2 <- lm(Artist_Popularity ~ fa_2 + fa_3 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
# summary(model2)
vif(model2)
## fa_2 fa_3 pc_1 pc_2
## 1.126103 1.674037 2.021619 1.082232
## Artist_Follower days_release_orig
## 1.177174 1.510313
select.cols <- c("Artist_Popularity", "fa_2", "fa_3", "pc_1", "pc_2", "Artist_Follower", "days_release_orig")
z <- as.data.frame(scale(dplyr::select(x, select.cols)))
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(select.cols)` instead of `select.cols` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
model2_z <- lm(Artist_Popularity ~ fa_2 + fa_3 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = z)
# summary(model2_z)
vif(model2_z)
## fa_2 fa_3 pc_1 pc_2
## 1.126103 1.674037 2.021619 1.082232
## Artist_Follower days_release_orig
## 1.177174 1.510313
model2_z$coefficients^2
## (Intercept) fa_2 fa_3 pc_1
## 8.251688e-33 1.318360e-02 8.530193e-02 5.212851e-02
## pc_2 Artist_Follower days_release_orig
## 3.226773e-01 1.063958e-01 1.819168e-02
sum(model2_z$coefficients^2) # 60% is the proportion in the variance of y explained by X and 40% of the variance of y is due to the variance of the residuals outside of the model IFF there was zero multicollinearity. In terms of relevance, 32% of variance in y is explained by pc_2, 10% by Artist_Follower, 8% by fa_3
## [1] 0.5978788
sort(round((model2_z$coefficients)^2, digits = 4), decreasing = T)
## pc_2 Artist_Follower fa_3 pc_1
## 0.3227 0.1064 0.0853 0.0521
## days_release_orig fa_2 (Intercept)
## 0.0182 0.0132 0.0000
# Quadratic specification seems appropriate:
# variables without and with quadratic terms (fa_2, pc_2, Artist_Follower)
par(mfcol=c(2,3))
lm_fa2 <- lm(Artist_Popularity ~ fa_2, data=x)
plot(x[,'fa_2'], x[,'Artist_Popularity'], xlab='fa_2', main=sprintf("R^2=%.2f", summary(lm_fa2)$r.squared),
ylim = c(min(min(fitted(lm_fa2)), min(x$Artist_Popularity)), max(max(fitted(lm_fa2)), max(x$Artist_Popularity))), ylab = 'Artist_Popularity')
abline(lm_fa2, col="red")
lmq_fa2 <- lm(Artist_Popularity ~ poly(fa_2, 2), data=x)
o <- order(x$fa_2)
plot(x[,'fa_2'], x[,'Artist_Popularity'], xlab= 'fa_2 + fa_2^2', main=sprintf("R^2=%.2f", summary(lmq_fa2)$r.squared),
ylim = c(min(min(fitted(lmq_fa2)), min(x$Artist_Popularity)), max(max(fitted(lmq_fa2)), max(x$Artist_Popularity))), ylab = 'Artist_Popularity')
lines(x$fa_2[o], fitted(lmq_fa2)[o], col="red")
lm_follower <- lm(Artist_Popularity ~ Artist_Follower, data=x)
plot(x[,'Artist_Follower'], x[,'Artist_Popularity'], xlab='Artist_Follower', main=sprintf("R^2=%.2f", summary(lm_follower)$r.squared),
ylim = c(min(min(fitted(lm_follower)), min(x$Artist_Popularity)), max(max(fitted(lm_follower)), max(x$Artist_Popularity))), ylab = 'Artist_Popularity')
abline(lm_follower, col="red")
lmq_follower <- lm(Artist_Popularity ~ poly(Artist_Follower, 2), data=x)
o <- order(x$Artist_Follower)
plot(x[,'Artist_Follower'], x[,'Artist_Popularity'], xlab= 'Artist_Follower + Artist_Follower^2', main=sprintf("R^2=%.2f", summary(lmq_follower)$r.squared),
ylim = c(min(min(fitted(lmq_follower)), min(x$Artist_Popularity)), max(max(fitted(lmq_follower)), max(x$Artist_Popularity))), ylab = 'Artist_Popularity')
lines(x$Artist_Follower[o], fitted(lmq_follower)[o], col="red")
lm_pc2 <- lm(Artist_Popularity ~ pc_2, data=x)
plot(x[,'pc_2'], x[,'Artist_Popularity'], xlab='pc_2', main=sprintf("R^2=%.2f", summary(lm_pc2)$r.squared),
ylim = c(min(min(fitted(lm_pc2)), min(x$Artist_Popularity)), max(max(fitted(lm_pc2)), max(x$Artist_Popularity))), ylab = 'Artist_Popularity')
abline(lm_pc2, col="red")
lmq_pc2 <- lm(Artist_Popularity ~ poly(pc_2, 2), data=x)
o <- order(x$pc_2)
plot(x[,'pc_2'], x[,'Artist_Popularity'], xlab= 'pc_2 + pc_2^2', main=sprintf("R^2=%.2f", summary(lmq_pc2)$r.squared),
ylim = c(min(min(fitted(lmq_pc2)), min(x$Artist_Popularity)), max(max(fitted(lmq_pc2)), max(x$Artist_Popularity))), ylab = 'Artist_Popularity')
lines(x$pc_2[o], fitted(lmq_pc2)[o], col="red")
# Residuals normality
ks.test(model1$residuals, "pnorm", mean = mean(model1$residuals), sd=sd(model1$residuals))$statistic
## D
## 0.04757442
1.3581 / sqrt(length(model1$residuals))
## [1] 0.09700714
ks.test(model2$residuals, "pnorm", mean = mean(model2$residuals), sd=sd(model2$residuals))$statistic
## D
## 0.05823339
1.3581 / sqrt(length(model2$residuals))
## [1] 0.09700714
par(mfcol=c(2,2))
plot(model2)
par(mfcol=c(2,3))
nx <- names(model2$model)
for (i in 2:length(model2$model)) {
plot(model2$model[,i], residuals(model2), xlab=nx[i])
lines(lowess(model2$model[,i], residuals(model2)), col = "red")
abline(h = 0, col = "black", lty = 2)
}
par(mfcol=c(1,1))
plot(model2$fitted.values, residuals(model2))
lines(lowess(model2$fitted.values, residuals(model2)), col = "red")
abline(h = 0, col = "black", lty = 2)
# residuals vs. predictors and fitted values suggests nonlinearity of the regression function
par(mfcol=c(2,3))
nx <- names(model2$model)
for (i in 2:length(model2$model)) {
plot(model2$model[,i], rstandard(model2), xlab=nx[i])
lines(lowess(model2$model[,i], rstandard(model2)), col = "red")
# wenn Linie von 0 abweicht: Residuen sind biased, sprich deren Mittelwert nicht 0
abline(h = 0, col = "black", lty = 2)
}
# standardized residuals vs. fitted values
par(mfcol=c(1,1))
plot(model2$fitted.values, rstandard(model2), main = 'Standardized residuals vs. fitted values')
lines(lowess(model2$fitted.values, rstandard(model2)), col = "red")
abline(h = 0, col = "black", lty = 2)
# Breusch-Pagan Test + Durbin-Watson Test / spherical errors (i.e. iid)
# model2 uses unscaled data
auxiliary_reg <- lm(model2$residuals^2 ~ fa_2 + fa_3 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
summary(auxiliary_reg)
##
## Call:
## lm(formula = model2$residuals^2 ~ fa_2 + fa_3 + pc_1 + pc_2 +
## Artist_Follower + days_release_orig, data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -209.99 -69.77 -26.75 39.15 712.88
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.093388 9.620134 9.573 < 2e-16 ***
## fa_2 0.973183 8.480588 0.115 0.90876
## fa_3 -29.748317 11.887822 -2.502 0.01318 *
## pc_1 -8.915701 5.369201 -1.661 0.09847 .
## pc_2 4.904291 6.741245 0.728 0.46782
## Artist_Follower 3.868671 1.290221 2.998 0.00308 **
## days_release_orig -0.004154 0.002273 -1.828 0.06918 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 109.4 on 189 degrees of freedom
## Multiple R-squared: 0.1067, Adjusted R-squared: 0.0783
## F-statistic: 3.761 on 6 and 189 DF, p-value: 0.001467
# R^2: 0.1067
# N = 196, N = nrow(x)
# test stat:
summary(auxiliary_reg)$r.square * nrow(x)
## [1] 20.90497
# test stat: 20.90497
p <- ncol(model2$model)-1
qchisq(.95, df = p) # df = number of parameters p (without the constant!), acc. to Klinke: q = 5*(5+3)/2 WTF?
## [1] 12.59159
# p_value = P(z > 12.59159) = 1-pchisq(20.90497, p=6) = 0.001908146 < 0.05 , reject H0 of homoskedasticity and conclude there is heteroskedasticity
1-pchisq(20.90497, p)
## [1] 0.001908146
library("lmtest")
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(model2)
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 20.905, df = 6, p-value = 0.001908
# test statistic n*R^2_aux > X^2_crit = qchisq(.95, df = 5), hence reject H0 and conclude that there's
# heteroskedasticity and constancy of error variance does not hold --> inference invalidated
dwtest(model2)
##
## Durbin-Watson test
##
## data: model2
## DW = 1.9149, p-value = 0.2323
## alternative hypothesis: true autocorrelation is greater than 0
# Durbin Watson test confirms that the error variance-covariance is diagonal, i.e. var(eps | X) = \Omega * I
# fail to reject H0: rho = 0 (alternative hypothesis is that there is autocorrelation in the errors)
# outlier detection / leverage
par(mfcol=c(1,2))
plot(hatvalues(model2), pch=19, main="Leverage", cex=0.5)
n <- nrow(model2$model)
p <- ncol(model2$model)-1
abline(h=(1:3)*(p+1)/n, col=c("black", "darkred", "red"))
plot(cooks.distance(model2), pch=19, main="Cook's distances",
cex=0.5)
n <- nrow(model2$model)
p <- ncol(model2$model)-1
abline(h=4/n, col="red")
outlier_index_leverage <- as.numeric(rownames(x[hatvalues(model2) > 3*(p+1)/n, ]))
outlier_index_final <- outlier_index_leverage
length(outlier_index_leverage)
## [1] 9
# There are 10 outliers that might be influential on the regression
x[outlier_index_leverage, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
"pc_2")]
# Rule of thumb Cook's distance
# movement of regression coefficients all together if ith observation is excluded
outlier_index_cooksd <- as.numeric(rownames(x[cooks.distance(model2) > 3*(p+1)/n, ]))
length(outlier_index_cooksd)
## [1] 2
x[outlier_index_cooksd, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
"pc_2")]
# differences
# SDBETA
SDBETA <- dfbetas(model2)
n <- nrow(model2$model)
par(mfcol=c(2,3))
plot(SDBETA[, 'fa_2'], main="fa_2", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'fa_3'], main="fa_3", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'pc_1'], main="pc_1", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'pc_2'], main="pc_2", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'Artist_Follower'], main="Artist_Follower", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'days_release_orig'], main="days_release_orig", pch=19)
abline(h=c(-2,2)/sqrt(n), col="red")
# SDFITS
par(mfcol=c(1,1))
SDFITS <- dffits(model2)
plot(SDFITS, pch=19)
abline(h=c(-1,1), col="red")
n <- nrow(model2$model)
p <- ncol(model2$model)-1
abline(h=c(-2,2)*sqrt(p/n), col="darkred")
# As this is a small data set, it is sufficient to analyze |\delta*y_i| > 1, in this case two observations are identified
# of exceeding this limit. They are
outlier_index_sdfits <- as.numeric(names(which(abs(SDFITS) > 1)))
x[outlier_index_sdfits, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
"pc_2")]
# Add quadratic terms for fa_2, Artist_Follower and pc_2 first before excluding outliers!
x$fa_2sq <- x$fa_2^2
x$pc_2sq <- x$pc_2^2
x$Artist_Followersq <- x$Artist_Follower^2
# model2: lm(Artist_Popularity ~ fa_2 + fa_3 + pc_1 + pc_2 + Artist_Follower + days_release_orig, data = x)
model3 <- lm(Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, data = x)
summary(model3)
##
## Call:
## lm(formula = Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 +
## pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig,
## data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.3742 -5.6832 0.3771 4.9769 19.6356
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.5611323 0.9877844 68.397 < 2e-16 ***
## fa_2 6.0858820 1.7276487 3.523 0.000538 ***
## fa_2sq -0.5504304 0.1615332 -3.408 0.000803 ***
## fa_3 4.4969138 0.9010981 4.990 1.38e-06 ***
## pc_1 1.8133635 0.4088899 4.435 1.57e-05 ***
## pc_2 5.9511646 0.5374732 11.072 < 2e-16 ***
## pc_2sq -2.1818759 0.3886134 -5.615 7.08e-08 ***
## Artist_Follower 1.6608731 0.2291803 7.247 1.10e-11 ***
## Artist_Followersq -0.0233799 0.0049095 -4.762 3.84e-06 ***
## days_release_orig -0.0004777 0.0001710 -2.794 0.005745 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.206 on 186 degrees of freedom
## Multiple R-squared: 0.7446, Adjusted R-squared: 0.7322
## F-statistic: 60.25 on 9 and 186 DF, p-value: < 2.2e-16
vif(model3)
## fa_2 fa_2sq fa_3 pc_1
## 8.309381 7.584394 1.710158 2.084608
## pc_2 pc_2sq Artist_Follower Artist_Followersq
## 1.223166 1.286026 6.603869 5.824194
## days_release_orig
## 1.519144
ks.test(model3$residuals, "pnorm", mean = mean(model3$residuals), sd=sd(model3$residuals))$statistic
## D
## 0.0367905
1.3581 / sqrt(length(model3$residuals))
## [1] 0.09700714
par(mfcol=c(2,2))
plot(model2)
par(mfcol=c(3,3))
nx <- names(model3$model)
for (i in 2:length(model3$model)) {
plot(model3$model[,i], residuals(model3), xlab=nx[i])
lines(lowess(model3$model[,i], residuals(model3)), col = "red")
abline(h = 0, col = "black", lty = 2)
}
par(mfcol=c(1,1))
plot(model3$fitted.values, residuals(model3))
lines(lowess(model3$fitted.values, residuals(model3)), col = "red")
abline(h = 0, col = "black", lty = 2)
# residuals vs. predictors and fitted values suggests accounting for nonlinearity of the regression function has improved
par(mfcol=c(3,3))
nx <- names(model3$model)
for (i in 2:length(model3$model)) {
plot(model3$model[,i], rstandard(model3), xlab=nx[i])
lines(lowess(model3$model[,i], rstandard(model3)), col = "red")
# wenn Linie von 0 abweicht: Residuen sind biased, sprich deren Mittelwert nicht 0
abline(h = 0, col = "black", lty = 2)
}
# standardized residuals vs. fitted values
par(mfcol=c(1,1))
plot(model3$fitted.values, rstandard(model3), main = 'Standardized residuals vs. fitted values')
lines(lowess(model3$fitted.values, rstandard(model3)), col = "red")
abline(h = 0, col = "black", lty = 2)
# Breusch-Pagan Test + Durbin-Watson Test / spherical errors (i.e. iid)
# model2 uses unscaled data
auxiliary_reg <- lm(model3$residuals^2 ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, data = x)
summary(auxiliary_reg)
##
## Call:
## lm(formula = model3$residuals^2 ~ fa_2 + fa_2sq + fa_3 + pc_1 +
## pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig,
## data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.32 -52.32 -23.00 18.34 719.43
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78.727326 11.223118 7.015 4.15e-11 ***
## fa_2 2.535328 19.629390 0.129 0.8974
## fa_2sq -0.474656 1.835326 -0.259 0.7962
## fa_3 -24.009503 10.238196 -2.345 0.0201 *
## pc_1 -6.105726 4.645771 -1.314 0.1904
## pc_2 -9.200191 6.106722 -1.507 0.1336
## pc_2sq -4.239332 4.415390 -0.960 0.3382
## Artist_Follower -1.901275 2.603926 -0.730 0.4662
## Artist_Followersq 0.021242 0.055781 0.381 0.7038
## days_release_orig -0.002471 0.001942 -1.272 0.2050
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93.24 on 186 degrees of freedom
## Multiple R-squared: 0.06758, Adjusted R-squared: 0.02246
## F-statistic: 1.498 on 9 and 186 DF, p-value: 0.1513
# R^2: 0.06758
# N = 196, N = nrow(x)
# test stat:
summary(auxiliary_reg)$r.square * nrow(x)
## [1] 13.24499
# test stat: 13.24499
p <- ncol(model3$model)-1
qchisq(.95, df = p) # df = number of parameters p (without the constant!), acc. to Klinke: q = 5*(5+3)/2 WTF?
## [1] 16.91898
# p_value = P(z > 16.91898) = 1-pchisq(13.24499, p=9) = 0.1518304 > 0.05 , fail to reject H0 of homoskedasticity, i.e. constant variance of error term
1-pchisq(13.24499, p)
## [1] 0.1518304
library("lmtest")
bptest(model3)
##
## studentized Breusch-Pagan test
##
## data: model3
## BP = 13.245, df = 9, p-value = 0.1518
# test statistic n*R^2_aux > X^2_crit = qchisq(.95, df = 5), hence fail to reject H0 and conclude that there's
# no heteroskedasticity --> valid inference
dwtest(model3)
##
## Durbin-Watson test
##
## data: model3
## DW = 2.0528, p-value = 0.5856
## alternative hypothesis: true autocorrelation is greater than 0
# Durbin Watson test confirms that the error variance-covariance is diagonal, i.e. var(eps | X) = \Omega * I
# fail to reject H0: rho = 0 (alternative hypothesis is that there is autocorrelation in the errors)
# outlier detection / leverage
par(mfcol=c(1,1))
plot(hatvalues(model3), pch=19, cex=0.5)
n <- nrow(model3$model)
p <- ncol(model3$model)-1
abline(h=(1:3)*(p+1)/n, col=c("black", "darkred", "red"))
plot(cooks.distance(model3), pch=19,
cex=0.5)
n <- nrow(model3$model)
p <- ncol(model3$model)-1
abline(h=4/n, col="red")
outlier_index_leverage <- as.numeric(rownames(x[hatvalues(model3) > 3*(p+1)/n, ]))
outlier_index_final <- outlier_index_leverage
length(outlier_index_leverage)
## [1] 10
# There are 10 outliers that might be influential on the regression
x[outlier_index_leverage, c("Track_Title","Track_Artist", "Genre", "Artist_Popularity", "Artist_Follower", "viewsCount")]
# Rule of thumb Cook's distance
# movement of regression coefficients all together if ith observation is excluded
outlier_index_cooksd <- as.numeric(rownames(x[cooks.distance(model3) > 3*(p+1)/n, ]))
length(outlier_index_cooksd)
## [1] 2
x[outlier_index_cooksd, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
"pc_2")]
# same outliers detected as in model2 even after modification of the model
# differences
# SDBETA
SDBETA <- dfbetas(model3)
n <- nrow(model3$model)
par(mfcol=c(3,3))
plot(SDBETA[, 'fa_2'], main="fa_2", pch=19, ylab = "fa_2")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'fa_2sq'], main="fa_2sq", pch=19, ylab = "fa_2sq")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'fa_3'], main="fa_3", pch=19, ylab = "fa_3")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'pc_1'], main="pc_1", pch=19, ylab = "pc_1")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'pc_2'], main="pc_2", pch=19, ylab = "pc_2")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'pc_2sq'], main="pc_2sq", pch=19, ylab = "pc_2sq")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'Artist_Follower'], main="Artist_Follower", pch=19, ylab = "Artist_Follower")
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'Artist_Followersq'], main="Artist_Followersq", pch=19, ylab = 'Artist_Followersq')
abline(h=c(-2,2)/sqrt(n), col="red")
plot(SDBETA[, 'days_release_orig'], main="days_release_orig", pch=19, ylab = 'days_release')
abline(h=c(-2,2)/sqrt(n), col="red")
# SDFITS
par(mfcol=c(1,1))
SDFITS <- dffits(model3)
plot(SDFITS, pch=19)
abline(h=c(-1,1), col="red")
n <- nrow(model3$model)
p <- ncol(model3$model)-1
abline(h=c(-2,2)*sqrt(p/n), col="darkred")
# As this is a small data set, it is sufficient to analyze |\delta*y_i| > 1, in this case two observations are identified
# of exceeding this limit. They are
outlier_index_sdfits <- as.numeric(names(which(abs(SDFITS) > 1)))
x[outlier_index_sdfits, c("Track_Title","Track_Artist", "Artist_Popularity", "Artist_Follower", "viewsCount", "pc_1",
"pc_2")]
# Now exclude the outliers from leverage analysis
x_noout <- x[-outlier_index_final, ]
lm_final <- lm(Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, data = x_noout)
summary(lm_final) # result: R^2 decreased a little bit
##
## Call:
## lm(formula = Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 +
## pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig,
## data = x_noout)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.4824 -5.2041 -0.3971 4.9615 19.4089
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.5778523 1.1556264 58.477 < 2e-16 ***
## fa_2 14.0294272 3.7773779 3.714 0.000274 ***
## fa_2sq -7.7026797 2.3813018 -3.235 0.001455 **
## fa_3 4.1812469 1.1144664 3.752 0.000238 ***
## pc_1 1.5664575 0.4198679 3.731 0.000257 ***
## pc_2 5.2986744 0.5665857 9.352 < 2e-16 ***
## pc_2sq -1.9828773 0.4235979 -4.681 5.68e-06 ***
## Artist_Follower 4.0311406 0.6231803 6.469 9.45e-10 ***
## Artist_Followersq -0.1626515 0.0347541 -4.680 5.70e-06 ***
## days_release_orig -0.0004396 0.0001751 -2.510 0.012974 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.855 on 176 degrees of freedom
## Multiple R-squared: 0.7448, Adjusted R-squared: 0.7317
## F-statistic: 57.06 on 9 and 176 DF, p-value: < 2.2e-16
vif(lm_final)
## fa_2 fa_2sq fa_3 pc_1
## 3.847406 3.207554 1.750377 2.249691
## pc_2 pc_2sq Artist_Follower Artist_Followersq
## 1.401325 1.436766 13.792858 12.676491
## days_release_orig
## 1.497164
# library("stargazer")
# stargazer::stargazer(model1, model2, model3, lm_final)
library("boot")
##
## Attaching package: 'boot'
## The following object is masked from 'package:robustbase':
##
## salinity
## The following object is masked from 'package:lattice':
##
## melanoma
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:psych':
##
## logit
ci <- confint(lm_final, level = 0.95)
plot.confint <- function (b, b.boot, ci, main) {
hist(b.boot, main=main); rug(b.boot)
abline(v=b, col="red"); abline(v=ci, col="blue")
abline(v=quantile(b.boot, c(0.025, 0.975)), col="green")
}
betahat <- boot(lm_final$model, R=999,
function(x, index) { lm(Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig, data = x_noout, subset=index)$coefficients })
par(mfcol=c(2,3))
plot.confint(lm_final$coefficients[1], betahat$t[,1], ci[1,], "Intercept")
plot.confint(lm_final$coefficients[2], betahat$t[,2], ci[2,], "fa_2")
plot.confint(lm_final$coefficients[3], betahat$t[,3], ci[3,], "fa_2sq")
plot.confint(lm_final$coefficients[4], betahat$t[,4], ci[4,], "fa_3")
plot.confint(lm_final$coefficients[5], betahat$t[,5], ci[5,], "pc_1")
plot.confint(lm_final$coefficients[6], betahat$t[,6], ci[6,], "pc_2")
par(mfcol=c(2,2))
plot.confint(lm_final$coefficients[7], betahat$t[,7], ci[7,], "pc_2sq")
plot.confint(lm_final$coefficients[8], betahat$t[,8], ci[8,], "Artist_Follower")
plot.confint(lm_final$coefficients[9], betahat$t[,9], ci[9,], "Artist_Followersq")
plot.confint(lm_final$coefficients[10], betahat$t[,10], ci[10,], "days_release")
model5 <- lm(Artist_Popularity ~ fa_2 + fa_2sq, data = x)
summary(model5)
##
## Call:
## lm(formula = Artist_Popularity ~ fa_2 + fa_2sq, data = x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.506 -10.505 1.289 10.461 36.224
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.5658 1.0458 63.649 < 2e-16 ***
## fa_2 18.0348 2.7106 6.654 2.88e-10 ***
## fa_2sq -1.3864 0.2653 -5.226 4.47e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.2 on 193 degrees of freedom
## Multiple R-squared: 0.2059, Adjusted R-squared: 0.1977
## F-statistic: 25.03 on 2 and 193 DF, p-value: 2.167e-10
vertex <- -model5$coefficients[2] / (2*model5$coefficients[3])
vertex
## fa_2
## 6.504143
(vertex <= max(x$fa_2)) & (vertex >= min(x$fa_2)) # so vertex is inside of range of Artist_Follower and there exists a quadratic component
## fa_2
## TRUE
# https://stats.idre.ucla.edu/r/faq/how-can-i-estimate-the-standard-error-of-transformed-regression-parameters-in-r-using-the-delta-method/
# https://www.statalist.org/forums/forum/general-stata-discussion/general/1376936-testing-whether-to-include-a-squared-term
# https://www.statalist.org/forums/forum/general-stata-discussion/general/1344089-how-to-find-the-turning-point-for-a-quadratic-function
# https://www.statalist.org/forums/forum/general-stata-discussion/general/1408413-significance-level-of-quadratic-term
# https://psych.unl.edu/psycrs/statpage/nonlin_eg.pdf ("true" quadratic component vs. skewed predictor)
library("msm") # equivalent to the nlcom function in Stata
##
## Attaching package: 'msm'
## The following object is masked from 'package:boot':
##
## cav
# z-value according to the Delta method
vertex / msm::deltamethod(~ -x2 / (2 * x3), coef(model5), vcov(model5)) # = 12.8421
## fa_2
## 12.8421
# CI according to Stata
vertex - qt(0.975, df = 10000000) * msm::deltamethod(~ -x2 / (2 * x3), coef(model5), vcov(model5))
## fa_2
## 5.511479
vertex + qt(0.975, df = 10000000) * msm::deltamethod(~ -x2 / (2 * x3), coef(model5), vcov(model5))
## fa_2
## 7.496808
n <- nrow(model5$model)
d <- length(model5$coefficients)
df <- n - d
# http://sia.webpopix.org/nonlinearRegression.html
# p_value = P(z > 1.959964) = 1-pt(z, df=df) = 3.270659e-10 < 0.05 <-- reject H0 that vertex is equal to zero
# vertex for fa_2
vertex <- -model3$coefficients[2] / (2*model3$coefficients[3])
vertex
## fa_2
## 5.528294
(vertex <= max(x$fa_2)) & (vertex >= min(x$fa_2))
## fa_2
## TRUE
z <- vertex / msm::deltamethod(~ -x2 / (2 * x3), coef(model3), vcov(model3))
z
## fa_2
## 8.814632
lower_CI <- vertex - qt(0.975, df = 10000000) * msm::deltamethod(~ -x2 / (2 * x3), coef(model3), vcov(model3))
upper_CI <- vertex + qt(0.975, df = 10000000) * msm::deltamethod(~ -x2 / (2 * x3), coef(model3), vcov(model3))
n <- nrow(model3$model)
d <- length(model3$coefficients)
df <- n - d
1-pt(z, df=df)
## fa_2
## 4.440892e-16
# p_value = P(z > 1.959964) = 1-pt(z, df=df) = 4.440892e-16 < 0.05 <-- reject H0 that vertex is equal to zero
# vertex for pc_2
vertex <- -model3$coefficients[6] / (2*model3$coefficients[7])
vertex
## pc_2
## 1.363773
(vertex <= max(x$pc_2)) & (vertex >= min(x$pc_2))
## pc_2
## TRUE
z <- vertex / msm::deltamethod(~ -x6 / (2 * x7), coef(model3), vcov(model3))
z
## pc_2
## 4.536158
lower_CI <- vertex - qt(0.975, df = 10000000) * msm::deltamethod(~ -x6 / (2 * x7), coef(model3), vcov(model3))
upper_CI <- vertex + qt(0.975, df = 10000000) * msm::deltamethod(~ -x6 / (2 * x7), coef(model3), vcov(model3))
n <- nrow(model3$model)
d <- length(model3$coefficients)
df <- n - d
1-pt(z, df=df)
## pc_2
## 5.125243e-06
# p_value = P(z > 1.959964) = 1-pt(z, df=df) = 5.125243e-06 < 0.05 <-- reject H0 that vertex is equal to zero
# vertex for Artist_Follower
vertex <- -model3$coefficients[8] / (2*model3$coefficients[9])
vertex
## Artist_Follower
## 35.51927
(vertex <= max(x$Artist_Follower)) & (vertex >= min(x$Artist_Follower))
## Artist_Follower
## TRUE
z <- vertex / msm::deltamethod(~ -x8 / (2 * x9), coef(model3), vcov(model3))
z
## Artist_Follower
## 9.652667
lower_CI <- vertex - qt(0.975, df = 10000000) * msm::deltamethod(~ -x8 / (2 * x9), coef(model3), vcov(model3))
upper_CI <- vertex + qt(0.975, df = 10000000) * msm::deltamethod(~ -x8 / (2 * x9), coef(model3), vcov(model3))
n <- nrow(model3$model)
d <- length(model3$coefficients)
df <- n - d
1-pt(z, df=df)
## Artist_Follower
## 0
#plot(Artist_Follower_trans, x$Artist_Popularity)
#min_max_normalize <- function(x)
#{
# return( (10-1)*((x- min(x)) /(max(x)-min(x))) + 1)
#}
#min_fa_2 <- optimize(ksD, c(-10,10), x=min_max_normalize(x$pc_2))$minimum
#fa_2bc <- bcPower(min_max_normalize(x$pc_2), min_fa_2)
#fa_2bcsqrr <- fa_2bc ** 0.5
#fa_2bcsqrrcenter <- fa_2bcsqrr - mean(fa_2bcsqrr)
#fa_2bcsqrrcentersq <- fa_2bcsqrrcenter ** 2
#cor(cbind(x$Artist_Popularity, fa_2bcsqrrcenter, fa_2bcsqrrcentersq))
#plot(x$fa_2, x$Artist_Popularity)
#plot(x$pc_2, x$Artist_Popularity)
# baseline model:
# forward
lmi <- lm(Artist_Popularity~1, data=lm_final$model)
lmf <- MASS::stepAIC(lmi, scope=~
fa_2
+ fa_2sq
+ fa_3
+ x_noout[,'fa_4']
+ pc_1
+ pc_2
+ pc_2sq
+ Artist_Follower
+ Artist_Followersq
+ days_release_orig
+ x_noout[, 'fa_1']
+ x_noout[, 'Track_Duration_ms']
+ x_noout[, 'Track_Popularity']
,direction = "forward")
## Start: AIC=1012.44
## Artist_Popularity ~ 1
##
## Df Sum of Sq RSS AIC
## + pc_2 1 17048.4 25493 919.20
## + x_noout[, "Track_Duration_ms"] 1 12472.7 30069 949.90
## + Artist_Follower 1 11518.9 31023 955.71
## + pc_2sq 1 10312.7 32229 962.81
## + fa_2 1 8694.3 33848 971.92
## + x_noout[, "Track_Popularity"] 1 7421.3 35120 978.79
## + Artist_Followersq 1 6625.6 35916 982.95
## + pc_1 1 1481.0 41061 1007.85
## + fa_2sq 1 1418.9 41123 1008.13
## <none> 42542 1012.44
## + days_release_orig 1 414.5 42127 1012.62
## + x_noout[, "fa_4"] 1 398.2 42144 1012.69
## + x_noout[, "fa_1"] 1 176.7 42365 1013.67
## + fa_3 1 133.5 42408 1013.86
##
## Step: AIC=919.2
## Artist_Popularity ~ pc_2
##
## Df Sum of Sq RSS AIC
## + Artist_Follower 1 7669.0 17824 854.64
## + Artist_Followersq 1 4882.1 20611 881.66
## + x_noout[, "Track_Duration_ms"] 1 4480.6 21013 885.25
## + fa_2 1 4209.3 21284 887.63
## + x_noout[, "Track_Popularity"] 1 3579.8 21914 893.06
## + pc_2sq 1 2528.4 22965 901.77
## + days_release_orig 1 2251.5 23242 904.00
## + pc_1 1 1731.7 23762 908.12
## + x_noout[, "fa_4"] 1 1088.3 24405 913.09
## + fa_2sq 1 856.0 24637 914.85
## + x_noout[, "fa_1"] 1 736.0 24757 915.75
## <none> 25493 919.20
## + fa_3 1 25.1 25468 921.02
##
## Step: AIC=854.64
## Artist_Popularity ~ pc_2 + Artist_Follower
##
## Df Sum of Sq RSS AIC
## + x_noout[, "Track_Popularity"] 1 3198.4 14626 819.85
## + x_noout[, "Track_Duration_ms"] 1 2853.7 14971 824.19
## + Artist_Followersq 1 2136.9 15688 832.89
## + pc_2sq 1 1360.1 16464 841.88
## + days_release_orig 1 1225.0 16599 843.40
## + fa_2 1 1157.3 16667 844.15
## + x_noout[, "fa_4"] 1 1050.6 16774 845.34
## + pc_1 1 843.0 16981 847.63
## <none> 17824 854.64
## + fa_2sq 1 96.1 17728 855.63
## + fa_3 1 71.3 17753 855.89
## + x_noout[, "fa_1"] 1 19.1 17805 856.44
##
## Step: AIC=819.85
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"]
##
## Df Sum of Sq RSS AIC
## + x_noout[, "Track_Duration_ms"] 1 1841.18 12785 796.83
## + Artist_Followersq 1 1806.26 12820 797.34
## + pc_2sq 1 1150.74 13475 806.61
## + days_release_orig 1 501.06 14125 815.37
## + pc_1 1 417.59 14208 816.47
## + x_noout[, "fa_4"] 1 282.55 14343 818.23
## + fa_3 1 198.73 14427 819.31
## <none> 14626 819.85
## + fa_2 1 142.21 14484 820.04
## + x_noout[, "fa_1"] 1 23.05 14603 821.56
## + fa_2sq 1 7.91 14618 821.75
##
## Step: AIC=796.83
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
## x_noout[, "Track_Duration_ms"]
##
## Df Sum of Sq RSS AIC
## + Artist_Followersq 1 1508.10 11277 775.48
## + pc_2sq 1 1241.24 11544 779.83
## + fa_3 1 664.00 12121 788.91
## + x_noout[, "fa_1"] 1 147.88 12637 796.67
## <none> 12785 796.83
## + fa_2 1 83.00 12702 797.62
## + days_release_orig 1 71.00 12714 797.79
## + pc_1 1 6.16 12779 798.74
## + x_noout[, "fa_4"] 1 5.55 12779 798.75
## + fa_2sq 1 4.04 12781 798.77
##
## Step: AIC=775.48
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
## x_noout[, "Track_Duration_ms"] + Artist_Followersq
##
## Df Sum of Sq RSS AIC
## + pc_2sq 1 1076.83 10200 758.82
## + fa_3 1 598.01 10679 767.35
## + fa_2sq 1 185.08 11092 774.40
## <none> 11277 775.48
## + x_noout[, "fa_1"] 1 84.76 11192 776.08
## + days_release_orig 1 36.02 11241 776.89
## + x_noout[, "fa_4"] 1 5.24 11272 777.40
## + pc_1 1 3.80 11273 777.42
## + fa_2 1 0.27 11276 777.48
##
## Step: AIC=758.82
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
## x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq
##
## Df Sum of Sq RSS AIC
## + fa_3 1 250.781 9949.1 756.19
## + fa_2sq 1 177.173 10022.7 757.56
## + days_release_orig 1 167.813 10032.1 757.73
## <none> 10199.9 758.82
## + pc_1 1 98.887 10101.0 759.00
## + x_noout[, "fa_1"] 1 81.693 10118.2 759.32
## + x_noout[, "fa_4"] 1 28.307 10171.6 760.30
## + fa_2 1 0.969 10198.9 760.80
##
## Step: AIC=756.19
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
## x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq +
## fa_3
##
## Df Sum of Sq RSS AIC
## + pc_1 1 546.47 9402.6 747.68
## + days_release_orig 1 302.11 9647.0 752.45
## + x_noout[, "fa_4"] 1 190.84 9758.3 754.58
## + fa_2sq 1 159.29 9789.8 755.18
## <none> 9949.1 756.19
## + x_noout[, "fa_1"] 1 1.79 9947.3 758.15
## + fa_2 1 0.74 9948.4 758.17
##
## Step: AIC=747.68
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
## x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq +
## fa_3 + pc_1
##
## Df Sum of Sq RSS AIC
## + fa_2sq 1 156.732 9245.9 746.55
## + days_release_orig 1 117.363 9285.3 747.34
## <none> 9402.6 747.68
## + x_noout[, "fa_1"] 1 25.394 9377.2 749.17
## + x_noout[, "fa_4"] 1 4.049 9398.6 749.60
## + fa_2 1 2.462 9400.2 749.63
##
## Step: AIC=746.55
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
## x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq +
## fa_3 + pc_1 + fa_2sq
##
## Df Sum of Sq RSS AIC
## + fa_2 1 191.308 9054.6 744.66
## + days_release_orig 1 105.159 9140.7 746.42
## <none> 9245.9 746.55
## + x_noout[, "fa_1"] 1 37.651 9208.2 747.79
## + x_noout[, "fa_4"] 1 0.048 9245.8 748.55
##
## Step: AIC=744.66
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
## x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq +
## fa_3 + pc_1 + fa_2sq + fa_2
##
## Df Sum of Sq RSS AIC
## + days_release_orig 1 109.025 8945.6 744.41
## <none> 9054.6 744.66
## + x_noout[, "fa_1"] 1 36.249 9018.3 745.92
## + x_noout[, "fa_4"] 1 0.008 9054.6 746.66
##
## Step: AIC=744.41
## Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[, "Track_Popularity"] +
## x_noout[, "Track_Duration_ms"] + Artist_Followersq + pc_2sq +
## fa_3 + pc_1 + fa_2sq + fa_2 + days_release_orig
##
## Df Sum of Sq RSS AIC
## <none> 8945.6 744.41
## + x_noout[, "fa_1"] 1 49.184 8896.4 745.38
## + x_noout[, "fa_4"] 1 6.072 8939.5 746.28
summary(lmf)
##
## Call:
## lm(formula = Artist_Popularity ~ pc_2 + Artist_Follower + x_noout[,
## "Track_Popularity"] + x_noout[, "Track_Duration_ms"] + Artist_Followersq +
## pc_2sq + fa_3 + pc_1 + fa_2sq + fa_2 + days_release_orig,
## data = lm_final$model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.0380 -5.2954 0.3417 4.5413 17.3011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.987e+01 3.045e+00 19.664 < 2e-16 ***
## pc_2 4.357e+00 5.531e-01 7.877 3.48e-13 ***
## Artist_Follower 4.143e+00 5.731e-01 7.229 1.48e-11 ***
## x_noout[, "Track_Popularity"] 1.936e-01 4.086e-02 4.737 4.48e-06 ***
## x_noout[, "Track_Duration_ms"] -1.528e-05 4.591e-06 -3.329 0.001065 **
## Artist_Followersq -1.614e-01 3.182e-02 -5.071 1.01e-06 ***
## pc_2sq -1.667e+00 3.918e-01 -4.255 3.41e-05 ***
## fa_3 3.769e+00 1.024e+00 3.682 0.000308 ***
## pc_1 9.810e-01 4.271e-01 2.297 0.022806 *
## fa_2sq -5.659e+00 2.205e+00 -2.566 0.011127 *
## fa_2 7.210e+00 3.700e+00 1.948 0.052974 .
## days_release_orig -2.378e-04 1.633e-04 -1.456 0.147127
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.17 on 174 degrees of freedom
## Multiple R-squared: 0.7897, Adjusted R-squared: 0.7764
## F-statistic: 59.41 on 11 and 174 DF, p-value: < 2.2e-16
# backward:
lma <- lm(Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + fa_4 + pc_1 + pc_2 + pc_2sq + Artist_Follower
+ Artist_Followersq + fa_1 + Track_Duration_ms
+ days_release_orig + Track_Popularity,
data= x_noout)
lmb <- MASS::stepAIC(lma)
## Start: AIC=747.24
## Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + fa_4 + pc_1 + pc_2 +
## pc_2sq + Artist_Follower + Artist_Followersq + fa_1 + Track_Duration_ms +
## days_release_orig + Track_Popularity
##
## Df Sum of Sq RSS AIC
## - fa_4 1 6.65 8896.4 745.38
## - fa_1 1 49.76 8939.5 746.28
## <none> 8889.7 747.24
## - days_release_orig 1 128.61 9018.3 747.92
## - fa_2 1 195.49 9085.2 749.29
## - pc_1 1 271.47 9161.2 750.84
## - fa_2sq 1 355.61 9245.3 752.54
## - Track_Duration_ms 1 536.66 9426.4 756.15
## - fa_3 1 615.81 9505.5 757.70
## - pc_2sq 1 830.66 9720.4 761.86
## - Track_Popularity 1 1166.70 10056.4 768.18
## - Artist_Followersq 1 1363.69 10253.4 771.79
## - pc_2 1 2572.29 11462.0 792.51
## - Artist_Follower 1 2719.78 11609.5 794.89
##
## Step: AIC=745.38
## Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq +
## Artist_Follower + Artist_Followersq + fa_1 + Track_Duration_ms +
## days_release_orig + Track_Popularity
##
## Df Sum of Sq RSS AIC
## - fa_1 1 49.18 8945.6 744.41
## <none> 8896.4 745.38
## - days_release_orig 1 121.96 9018.3 745.92
## - fa_2 1 193.77 9090.1 747.39
## - pc_1 1 295.87 9192.2 749.47
## - fa_2sq 1 349.34 9245.7 750.55
## - Track_Duration_ms 1 530.56 9426.9 754.16
## - fa_3 1 632.49 9528.9 756.16
## - pc_2sq 1 828.02 9724.4 759.94
## - Track_Popularity 1 1183.07 10079.4 766.61
## - Artist_Followersq 1 1358.68 10255.0 769.82
## - pc_2 1 2647.56 11543.9 791.84
## - Artist_Follower 1 2723.31 11619.7 793.06
##
## Step: AIC=744.41
## Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq +
## Artist_Follower + Artist_Followersq + Track_Duration_ms +
## days_release_orig + Track_Popularity
##
## Df Sum of Sq RSS AIC
## <none> 8945.6 744.41
## - days_release_orig 1 109.0 9054.6 744.66
## - fa_2 1 195.2 9140.7 746.42
## - pc_1 1 271.3 9216.8 747.97
## - fa_2sq 1 338.5 9284.1 749.32
## - Track_Duration_ms 1 569.7 9515.2 753.89
## - fa_3 1 697.0 9642.6 756.37
## - pc_2sq 1 930.7 9876.3 760.82
## - Track_Popularity 1 1153.8 10099.4 764.97
## - Artist_Followersq 1 1321.8 10267.4 768.04
## - Artist_Follower 1 2686.6 11632.1 791.26
## - pc_2 1 3190.0 12135.5 799.13
summary(lmb)
##
## Call:
## lm(formula = Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 +
## pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + Track_Duration_ms +
## days_release_orig + Track_Popularity, data = x_noout)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.0380 -5.2954 0.3417 4.5413 17.3011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.987e+01 3.045e+00 19.664 < 2e-16 ***
## fa_2 7.210e+00 3.700e+00 1.948 0.052974 .
## fa_2sq -5.659e+00 2.205e+00 -2.566 0.011127 *
## fa_3 3.769e+00 1.024e+00 3.682 0.000308 ***
## pc_1 9.810e-01 4.271e-01 2.297 0.022806 *
## pc_2 4.357e+00 5.531e-01 7.877 3.48e-13 ***
## pc_2sq -1.667e+00 3.918e-01 -4.255 3.41e-05 ***
## Artist_Follower 4.143e+00 5.731e-01 7.229 1.48e-11 ***
## Artist_Followersq -1.614e-01 3.182e-02 -5.071 1.01e-06 ***
## Track_Duration_ms -1.528e-05 4.591e-06 -3.329 0.001065 **
## days_release_orig -2.378e-04 1.633e-04 -1.456 0.147127
## Track_Popularity 1.936e-01 4.086e-02 4.737 4.48e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.17 on 174 degrees of freedom
## Multiple R-squared: 0.7897, Adjusted R-squared: 0.7764
## F-statistic: 59.41 on 11 and 174 DF, p-value: < 2.2e-16
# Regularization
sel.var <- c("fa_1", "fa_2", "fa_2sq", "fa_3", "fa_4", "pc_1", "pc_2", "pc_2sq", "Artist_Follower", "Artist_Followersq", "Artist_Popularity", "days_release_orig", "Track_Popularity", "Track_Duration_ms")
df <- dplyr::select(x_noout, sel.var)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(sel.var)` instead of `sel.var` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
lmfull <- lm(Artist_Popularity~., data=df)
lmback <- step(lmfull)
## Start: AIC=747.24
## Artist_Popularity ~ fa_1 + fa_2 + fa_2sq + fa_3 + fa_4 + pc_1 +
## pc_2 + pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig +
## Track_Popularity + Track_Duration_ms
##
## Df Sum of Sq RSS AIC
## - fa_4 1 6.65 8896.4 745.38
## - fa_1 1 49.76 8939.5 746.28
## <none> 8889.7 747.24
## - days_release_orig 1 128.61 9018.3 747.92
## - fa_2 1 195.49 9085.2 749.29
## - pc_1 1 271.47 9161.2 750.84
## - fa_2sq 1 355.61 9245.3 752.54
## - Track_Duration_ms 1 536.66 9426.4 756.15
## - fa_3 1 615.81 9505.5 757.70
## - pc_2sq 1 830.66 9720.4 761.86
## - Track_Popularity 1 1166.70 10056.4 768.18
## - Artist_Followersq 1 1363.69 10253.4 771.79
## - pc_2 1 2572.29 11462.0 792.51
## - Artist_Follower 1 2719.78 11609.5 794.89
##
## Step: AIC=745.38
## Artist_Popularity ~ fa_1 + fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 +
## pc_2sq + Artist_Follower + Artist_Followersq + days_release_orig +
## Track_Popularity + Track_Duration_ms
##
## Df Sum of Sq RSS AIC
## - fa_1 1 49.18 8945.6 744.41
## <none> 8896.4 745.38
## - days_release_orig 1 121.96 9018.3 745.92
## - fa_2 1 193.77 9090.1 747.39
## - pc_1 1 295.87 9192.2 749.47
## - fa_2sq 1 349.34 9245.7 750.55
## - Track_Duration_ms 1 530.56 9426.9 754.16
## - fa_3 1 632.49 9528.9 756.16
## - pc_2sq 1 828.02 9724.4 759.94
## - Track_Popularity 1 1183.07 10079.4 766.61
## - Artist_Followersq 1 1358.68 10255.0 769.82
## - pc_2 1 2647.56 11543.9 791.84
## - Artist_Follower 1 2723.31 11619.7 793.06
##
## Step: AIC=744.41
## Artist_Popularity ~ fa_2 + fa_2sq + fa_3 + pc_1 + pc_2 + pc_2sq +
## Artist_Follower + Artist_Followersq + days_release_orig +
## Track_Popularity + Track_Duration_ms
##
## Df Sum of Sq RSS AIC
## <none> 8945.6 744.41
## - days_release_orig 1 109.0 9054.6 744.66
## - fa_2 1 195.2 9140.7 746.42
## - pc_1 1 271.3 9216.8 747.97
## - fa_2sq 1 338.5 9284.1 749.32
## - Track_Duration_ms 1 569.7 9515.2 753.89
## - fa_3 1 697.0 9642.6 756.37
## - pc_2sq 1 930.7 9876.3 760.82
## - Track_Popularity 1 1153.8 10099.4 764.97
## - Artist_Followersq 1 1321.8 10267.4 768.04
## - Artist_Follower 1 2686.6 11632.1 791.26
## - pc_2 1 3190.0 12135.5 799.13
library("glmnet")
## Loading required package: Matrix
## Loaded glmnet 3.0-2
xl <- as.matrix(df[,-11])
yl <- df[,11]
set.seed(42)
lmlasso <- cv.glmnet(xl, yl, alpha =1) # cross-validation finds the value of lambda which minimizes MSE, alpha = 1 for LASSO
plot(lmlasso) # displays the bias-variance trade-off (MSE = variance + bias^2)
best.lambda <- lmlasso$lambda.min
lmlasso$lambda.1se
## [1] 0.3361553
cl <- list(coef(lmlasso, s="lambda.min")[,1], coef(lmlasso, s="lambda.1se")[,1], coef(lmback), coef(lmfull))
cf <- matrix(NA, nrow=dim(xl)[2]+1, ncol=length(cl))
row.names(cf) <- names(lmfull$coefficients)
i <- 1
for (ci in cl) {
cf[names(ci),i] <- ci[names(ci)]
i <- i+1
}
colnames(cf) <- c("min", "1se", "back", "full")
cf[cf==0] <- NA
cf
## min 1se back full
## (Intercept) 6.040412e+01 6.114886e+01 5.987096e+01 5.933345e+01
## fa_1 -4.950318e-02 NA NA -8.893245e-01
## fa_2 5.222430e+00 2.626698e+00 7.209531e+00 7.217918e+00
## fa_2sq -3.457641e+00 -5.763024e-01 -5.658801e+00 -5.837164e+00
## fa_3 3.323039e+00 2.638256e+00 3.769373e+00 4.505508e+00
## fa_4 NA NA NA 3.733388e-01
## pc_1 8.325534e-01 6.300647e-01 9.809889e-01 1.111732e+00
## pc_2 4.444693e+00 4.581303e+00 4.356624e+00 4.151896e+00
## pc_2sq -1.666188e+00 -1.671870e+00 -1.667241e+00 -1.600879e+00
## Artist_Follower 3.098932e+00 1.731344e+00 4.143098e+00 4.255461e+00
## Artist_Followersq -1.000969e-01 -2.026752e-02 -1.613611e-01 -1.647535e-01
## days_release_orig -2.334705e-04 -2.257810e-04 -2.377630e-04 -2.659810e-04
## Track_Popularity 1.908254e-01 1.868568e-01 1.935870e-01 1.997832e-01
## Track_Duration_ms -1.648856e-05 -1.812532e-05 -1.528159e-05 -1.495624e-05
# variable importance
glmmod <- glmnet(xl, yl, lambda = best.lambda)
coefs = coef(glmmod)[,1]
coefs = sort(abs(coefs), decreasing = T)
coefs
## (Intercept) fa_2 pc_2 fa_2sq
## 6.041106e+01 5.233869e+00 4.444640e+00 3.461076e+00
## fa_3 Artist_Follower pc_2sq pc_1
## 3.325658e+00 3.096916e+00 1.666081e+00 8.321862e-01
## Track_Popularity Artist_Followersq fa_1 days_release_orig
## 1.907631e-01 9.999550e-02 5.110609e-02 2.337083e-04
## Track_Duration_ms fa_4
## 1.649063e-05 0.000000e+00
coef(lmlasso, s="lambda.1se")
## 14 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 6.114886e+01
## fa_1 .
## fa_2 2.626698e+00
## fa_2sq -5.763024e-01
## fa_3 2.638256e+00
## fa_4 .
## pc_1 6.300647e-01
## pc_2 4.581303e+00
## pc_2sq -1.671870e+00
## Artist_Follower 1.731344e+00
## Artist_Followersq -2.026752e-02
## days_release_orig -2.257810e-04
## Track_Popularity 1.868568e-01
## Track_Duration_ms -1.812532e-05
######################
# Performance metrics - full data
performace_metrics <- matrix(nrow = 10, ncol = 6)
RMSE(predict(lmfull, newdata = df), df$Artist_Popularity)
## [1] 6.913336
R2(predict(lmfull, newdata = df), df$Artist_Popularity)
## [1] 0.7910356
performace_metrics[1,1] <- RMSE(predict(lmfull, newdata = df), df$Artist_Popularity)
performace_metrics[1,2] <- R2(predict(lmfull, newdata = df), df$Artist_Popularity)
# x_model <- model.matrix(Artist_Popularity~., x_noout)
library("glmnet")
"Ridge regression"
## [1] "Ridge regression"
set.seed(42)
best.lambda <- cv.glmnet(x = as.matrix(df[,-11]), y = as.matrix(df[,11]), alpha = 0)$lambda.min
ridge.cv <- glmnet(x = as.matrix(df[,-11]), y = as.matrix(df[,11]), alpha = 0, lambda = best.lambda)
# as.numeric(coef(ridge.cv)[,1]) %*% as.numeric(x_model[1,]) equivalent
# predict(ridge.cv, newx = as.matrix(x_noout[1,-8])) equivalent
RMSE(predict(ridge.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
## [1] 7.239088
R2(predict(ridge.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
## s0
## [1,] 0.7724354
performace_metrics[2,1] <- RMSE(predict(ridge.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
performace_metrics[2,2] <- R2(predict(ridge.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
"LASSO regression"
## [1] "LASSO regression"
set.seed(42)
best.lambda <- cv.glmnet(x = as.matrix(df[,-11]), y = as.matrix(df[,11]), alpha = 1)$lambda.min
lasso.cv <- glmnet(x = as.matrix(df[,-11]), y = as.matrix(df[,11]), alpha = 1, lambda = best.lambda)
RMSE(predict(lasso.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
## [1] 7.029595
R2(predict(lasso.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
## s0
## [1,] 0.7847568
performace_metrics[3,1] <- RMSE(predict(lasso.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
performace_metrics[3,2] <- R2(predict(lasso.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
"Elasticnet regression"
## [1] "Elasticnet regression"
set.seed(42)
best.lambda <- cv.glmnet(x = as.matrix(df[,-11]), y = as.matrix(df[,11]), alpha = 0.5)$lambda.min
elasticnet.cv <- glmnet(x = as.matrix(df[,-11]), y = as.matrix(df[,11]), alpha = 0.5, lambda = best.lambda)
RMSE(predict(elasticnet.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
## [1] 7.043502
R2(predict(elasticnet.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
## s0
## [1,] 0.7839069
performace_metrics[4,1] <- RMSE(predict(elasticnet.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
performace_metrics[4,2] <- R2(predict(elasticnet.cv, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
x_full <- model.matrix(Artist_Popularity~., df)[,-1]
ridge_res2 <- as.numeric(df$Artist_Popularity - predict(ridge.cv, x_full))^2
lasso_res2 <- as.numeric(df$Artist_Popularity - predict(lasso.cv, x_full))^2
elasticnet_res2 <- as.numeric(df$Artist_Popularity - predict(elasticnet.cv, x_full))^2
par(mfcol=c(2,2))
barplot((df$Artist_Popularity - predict(lmfull, df))^2)
Axis(side=2)
title(main = paste("LM (full sample): RMSE =",round(RMSE(predict(lmfull, df), df$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lmfull, df), df$Artist_Popularity), digits = 2)))
barplot(ridge_res2, main = paste("Ridge (full sample): RMSE =",round(RMSE(predict(ridge.cv, x_full), df$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(ridge.cv, x_full), df$Artist_Popularity), digits = 2)))
barplot(lasso_res2, main = paste("LASSO (full sample): RMSE =",round(RMSE(predict(lasso.cv, x_full), df$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lasso.cv, x_full), df$Artist_Popularity), digits = 2)))
barplot(elasticnet_res2, main = paste("Elasticnet (full sample): RMSE =",round(RMSE(predict(elasticnet.cv, x_full), df$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(elasticnet.cv, x_full), df$Artist_Popularity), digits = 2)))
library("caret")
# Splitting in train and test data
set.seed(42)
idx.train <- createDataPartition(y = df$Artist_Popularity, p = 0.75, list = FALSE)
train <- df[idx.train, ]
test <- df[-idx.train,]
lm_train <- lm(Artist_Popularity~., data=train)
RMSE(predict(lm_train, newdata = train), train$Artist_Popularity)
## [1] 6.835059
R2(predict(lm_train, newdata = train), train$Artist_Popularity)
## [1] 0.7999369
performace_metrics[1,3] <- RMSE(predict(lm_train, newdata = train), train$Artist_Popularity)
performace_metrics[1,4] <- R2(predict(lm_train, newdata = train), train$Artist_Popularity)
RMSE(predict(lm_train, newdata = test), test$Artist_Popularity)
## [1] 9.279663
R2(predict(lm_train, newdata = test), test$Artist_Popularity)
## [1] 0.6328343
performace_metrics[1,5] <- RMSE(predict(lm_train, newdata = test), test$Artist_Popularity)
performace_metrics[1,6] <- R2(predict(lm_train, newdata = test), test$Artist_Popularity)
"Ridge regression"
## [1] "Ridge regression"
set.seed(42)
best.lambda <- cv.glmnet(x = as.matrix(train[,-11]), y = as.matrix(train[,11]), alpha = 0)$lambda.min
ridge.cv <- glmnet(x = as.matrix(train[,-11]), y = as.matrix(train[,11]), alpha = 0, lambda = best.lambda)
# as.numeric(coef(ridge.cv)[,1]) %*% as.numeric(x_model[1,]) equivalent
# predict(ridge.cv, newx = as.matrix(x_noout[1,-8])) equivalent
RMSE(predict(ridge.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
## [1] 7.209575
R2(predict(ridge.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
## s0
## [1,] 0.7787364
performace_metrics[2,3] <- RMSE(predict(ridge.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
performace_metrics[2,4] <- R2(predict(ridge.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
RMSE(predict(ridge.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
## [1] 8.186115
R2(predict(ridge.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
## s0
## [1,] 0.7030654
performace_metrics[2,5] <- RMSE(predict(ridge.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
performace_metrics[2,6] <- R2(predict(ridge.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
"LASSO regression"
## [1] "LASSO regression"
set.seed(42)
best.lambda <- cv.glmnet(x = as.matrix(train[,-11]), y = as.matrix(train[,11]), alpha = 1)$lambda.min
lasso.cv <- glmnet(x = as.matrix(train[,-11]), y = as.matrix(train[,11]), alpha = 1, lambda = best.lambda)
RMSE(predict(lasso.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
## [1] 6.940459
R2(predict(lasso.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
## s0
## [1,] 0.7944228
performace_metrics[3,3] <- RMSE(predict(lasso.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
performace_metrics[3,4] <- R2(predict(lasso.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
RMSE(predict(lasso.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
## [1] 7.695779
R2(predict(lasso.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
## s0
## [1,] 0.7335293
performace_metrics[3,5] <- RMSE(predict(lasso.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
performace_metrics[3,6] <- R2(predict(lasso.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
"Elasticnet regression"
## [1] "Elasticnet regression"
set.seed(42)
best.lambda <- cv.glmnet(x = as.matrix(train[,-11]), y = as.matrix(train[,11]), alpha = 0.5)$lambda.min
elasticnet.cv <- glmnet(x = as.matrix(train[,-11]), y = as.matrix(train[,11]), alpha = 0.5, lambda = best.lambda)
RMSE(predict(elasticnet.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
## [1] 6.935805
R2(predict(elasticnet.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
## s0
## [1,] 0.7945423
performace_metrics[4,3] <- RMSE(predict(elasticnet.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
performace_metrics[4,4] <- R2(predict(elasticnet.cv, newx = as.matrix(train[,-11])), as.matrix(train[,11]))
RMSE(predict(elasticnet.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
## [1] 8.055396
R2(predict(elasticnet.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
## s0
## [1,] 0.7106164
performace_metrics[4,5] <- RMSE(predict(elasticnet.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
performace_metrics[4,6] <- R2(predict(elasticnet.cv, newx = as.matrix(test[,-11])), as.matrix(test[,11]))
# Graphics
# Training data
x_train <- model.matrix(Artist_Popularity~., train)[,-1]
ridge_res2 <- as.numeric(train$Artist_Popularity - predict(ridge.cv, x_train))^2
lasso_res2 <- as.numeric(train$Artist_Popularity - predict(lasso.cv, x_train))^2
elasticnet_res2 <- as.numeric(train$Artist_Popularity - predict(elasticnet.cv, x_train))^2
par(mfcol=c(2,2))
barplot((train$Artist_Popularity - predict(lm_train, train))^2, main = paste("LM (train): RMSE =",round(RMSE(predict(lm_train, train), train$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lm_train, train), train$Artist_Popularity), digits = 2)))
barplot(ridge_res2, main = paste("Ridge (train): RMSE =",round(RMSE(predict(ridge.cv, x_train), train$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(ridge.cv, x_train), train$Artist_Popularity), digits = 2)))
barplot(lasso_res2, main = paste("LASSO (train): RMSE =",round(RMSE(predict(lasso.cv, x_train), train$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lasso.cv, x_train), train$Artist_Popularity), digits = 2)))
barplot(elasticnet_res2, main = paste("Elasticnet (train): RMSE =",round(RMSE(predict(elasticnet.cv, x_train), train$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(elasticnet.cv, x_train), train$Artist_Popularity), digits = 2)))
# Test data
x_test <- model.matrix(Artist_Popularity~., test)[,-1]
ridge_res2 <- as.numeric(test$Artist_Popularity - predict(ridge.cv, x_test))^2
lasso_res2 <- as.numeric(test$Artist_Popularity - predict(lasso.cv, x_test))^2
elasticnet_res2 <- as.numeric(test$Artist_Popularity - predict(elasticnet.cv, x_test))^2
par(mfcol=c(2,2))
barplot((test$Artist_Popularity - predict(lm_train, test))^2, main = paste("LM (test): RMSE =",round(RMSE(predict(lm_train, test), test$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lm_train, test), test$Artist_Popularity), digits = 2)))
barplot(ridge_res2, main = paste("Ridge (test): RMSE =",round(RMSE(predict(ridge.cv, x_test), test$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(ridge.cv, x_test), test$Artist_Popularity), digits = 2)))
barplot(lasso_res2, main = paste("LASSO (test): RMSE =",round(RMSE(predict(lasso.cv, x_test), test$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(lasso.cv, x_test), test$Artist_Popularity), digits = 2)))
barplot(elasticnet_res2, main = paste("Elasticnet (test): RMSE =",round(RMSE(predict(elasticnet.cv, x_test), test$Artist_Popularity), digits = 2), ", R^2:", round(R2(predict(elasticnet.cv, x_test), test$Artist_Popularity), digits = 2)))
# Non-parametric and semiparametric regression
# Kernel density estimator
library("np")
## Nonparametric Kernel Methods for Mixed Datatypes (version 0.60-10)
## [vignette("np_faq",package="np") provides answers to frequently asked questions]
## [vignette("np",package="np") an overview]
## [vignette("entropy_np",package="np") an overview of entropy-based methods]
bw <- npudensbw(~Track_Duration_ms, data=df) # Track_Duration_ms bi-modal?
##
Multistart 1 of 1 |
Multistart 1 of 1 |
Multistart 1 of 1 |
Multistart 1 of 1 /
Multistart 1 of 1 |
Multistart 1 of 1 |
fhat <- npudens(bw)
fhat
##
## Density Data: 186 training points, in 1 variable(s)
## Track_Duration_ms
## Bandwidth(s): 25427.78
##
## Bandwidth Type: Fixed
## Log Likelihood: -2404.132
##
## Continuous Kernel Type: Second-Order Gaussian
## No. Continuous Vars.: 1
plot(fhat, main=sprintf("%s with h=%.2f", fhat$pckertype, fhat$bw))
rug(df$Track_Duration_ms)
bw <- npudensbw(~Artist_Popularity+Track_Duration_ms, data=df, bwmethod="normal-reference")
fhat <- npudens(bw)
# plot(fhat) # bi-modal
par(mfcol=c(1,1))
#1) at Track_Duration_ms ~ 1 and Artist_Popularity ~ -1 (meaning longer tracks' artists are less popular)
plot(fhat, view="fixed", phi=10, theta=320, main = "")
par(mfcol=c(1,1))
# 2) at Track_Duration_ms ~ -0.5 and Artist_Popularity ~ 1
plot(fhat, view="fixed", phi=10, theta=155, main = "")
par(mfcol=c(1,1))
nw2 <- npreg(Artist_Popularity~pc_2+Track_Duration_ms, data=df)
##
Multistart 1 of 2 |
Multistart 1 of 2 |
Multistart 1 of 2 |
Multistart 1 of 2 /
Multistart 1 of 2 -
Multistart 1 of 2 |
Multistart 1 of 2 |
Multistart 2 of 2 |
Multistart 2 of 2 |
Multistart 2 of 2 /
Multistart 2 of 2 -
Multistart 2 of 2 |
Multistart 2 of 2 |
summary(nw2)
##
## Regression Data: 186 training points, in 2 variable(s)
## pc_2 Track_Duration_ms
## Bandwidth(s): 0.6558287 64051.76
##
## Kernel Regression Estimator: Local-Constant
## Bandwidth Type: Fixed
## Residual standard error: 9.136543
## R-squared: 0.6351856
##
## Continuous Kernel Type: Second-Order Gaussian
## No. Continuous Explanatory Vars.: 2
# plot(nw2)
# Nadaraya-Watson estimator
# bw <- npregbw(Artist_Popularity~Track_Duration_ms, data=df)
# mhat <- npreg(bw)
# main <- sprintf("%s with h=%.2f", mhat$pckertype, mhat$bw)
# plot(df$Artist_Popularity, df$Track_Duration_ms, pch=19, cex=0.3, main=main)
# ind <- order(df$Track_Duration_ms)
# xs <- cbind(df$Track_Duration_ms, fitted(mhat))[ind,]
# lines(xs[,1], xs[,2], col="red", lwd=2)
# rug(df$Track_Duration_ms)
# bw <- npregbw(Artist_Popularity~Track_Duration_ms+Track_Popularity, data=df)
# mhat <- npreg(bw)
# plot(mhat)
plotContour <- function (model, data, n=30) {
mf <- model.frame(model$terms, data)
mc <- lapply(as.list(mf[,-1]), pretty, n=n)
zc <- predict(model, expand.grid(mc))
dim(zc) <- sapply(mc, length)
r2 <- 1-var(residuals(model))/var(mf[,1])
contour(mc[[1]], mc[[2]], zc, xlab=names(mf)[2], ylab=names(mf)[3],
main=sprintf("R^2=%.3f", r2))
cc <- gray(0.75-0.75*(mf[,1]-min(mf[,1]))/(max(mf[,1])-min(mf[,1])))
points(mf[,2], mf[,3], pch=19, cex=0.5, col=cc)
}
model <- lm(Artist_Popularity~pc_1 + pc_2, data=df)
par(mfrow=c(1,1))
plotContour(model, df)
# Partial linear model (mixing linear and spline terms)
library("mgcv")
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:ryouready':
##
## intervals
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
##
## Attaching package: 'mgcv'
## The following object is masked from 'package:mclust':
##
## mvn
## The following object is masked from 'package:MVN':
##
## mvn
model <- gam(Artist_Popularity~s(Track_Popularity)+Track_Duration_ms, data=df)
# plotContour(model, df$Track_Popularity, df$Track_Duration_ms, df$Artist_Popularity)
# plot(model)
# Additive model
library("gam")
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.16.1
##
## Attaching package: 'gam'
## The following objects are masked from 'package:mgcv':
##
## gam, gam.control, gam.fit, s
am1 <- gam(Artist_Popularity ~ s(fa_1) + s(fa_2) + s(fa_3) + s(fa_4) + s(pc_1) + s(pc_2) + s(Artist_Follower) + s(days_release_orig) + s(Track_Popularity) + s(Track_Duration_ms) + s(fa_2sq) + s(pc_2sq) + s(Artist_Followersq), data=df)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
print(am1)
## Call:
## gam(formula = Artist_Popularity ~ s(fa_1) + s(fa_2) + s(fa_3) +
## s(fa_4) + s(pc_1) + s(pc_2) + s(Artist_Follower) + s(days_release_orig) +
## s(Track_Popularity) + s(Track_Duration_ms) + s(fa_2sq) +
## s(pc_2sq) + s(Artist_Followersq), data = df)
##
## Degrees of Freedom: 185 total; 76.48856 Residual
## Residual Deviance: 2110.514
par(mfrow=c(3,4))
plot(am1)
plot(fitted(am1), residuals(am1))
lines(lowess(fitted(am1), residuals(am1)), col="red", lwd=2)
n <- nrow(df)
1-sum(residuals(am1)^2)/((n-1)*var(df$Artist_Popularity))
## [1] 0.9503897
performace_metrics[5,1] <- RMSE(predict(am1, newdata = df), df$Artist_Popularity)
performace_metrics[5,2] <- R2(predict(am1, newdata = df), df$Artist_Popularity)
am1_train <- gam(Artist_Popularity ~ s(fa_1) + s(fa_2) + s(fa_3) + s(fa_4) + s(pc_1) + s(pc_2) + s(Artist_Follower) + s(days_release_orig) + s(Track_Popularity) + s(Track_Duration_ms) + s(fa_2sq) + s(pc_2sq) + s(Artist_Followersq), data=train)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
performace_metrics[5,3] <- RMSE(predict(am1_train, newdata = train), train$Artist_Popularity)
performace_metrics[5,4] <- R2(predict(am1_train, newdata = train), train$Artist_Popularity)
performace_metrics[5,5] <- RMSE(predict(am1_train, newdata = test), test$Artist_Popularity)
performace_metrics[5,6] <- R2(predict(am1_train, newdata = test), test$Artist_Popularity)
# Single index model
sim <- npindex(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity + Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq, data=df)
## Multistart 1 of 5...Multistart 2 of 5...Multistart 3 of 5...Multistart 4 of 5...Multistart 5 of 5...
summary(sim)
##
## Single Index Model
## Regression Data: 186 training points, in 13 variable(s)
##
## fa_1 fa_2 fa_3 fa_4 pc_1 pc_2 Artist_Follower
## Beta: 1 7.140808 -0.9481839 1.320797 0.3963661 1.316466 10.62225
## days_release_orig Track_Popularity Track_Duration_ms fa_2sq pc_2sq
## Beta: -4.818576e-05 0.08091355 -7.782318e-06 -7.679957 -0.8994251
## Artist_Followersq
## Beta: -0.1862698
## Bandwidth: 0.8377536
## Kernel Regression Estimator: Local-Constant
##
## Residual standard error: 5.506267
## R-squared: 0.8675706
##
## Continuous Kernel Type: Second-Order Gaussian
## No. Continuous Explanatory Vars.: 1
performace_metrics[6,1] <- RMSE(predict(sim, newdata = df), df$Artist_Popularity)
performace_metrics[6,2] <- R2(predict(sim, newdata = df), df$Artist_Popularity)
sim_train <- npindex(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity + Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq, data=train)
## Multistart 1 of 5...Multistart 2 of 5...Multistart 3 of 5...Multistart 4 of 5...Multistart 5 of 5...
performace_metrics[6,3] <- RMSE(predict(sim_train, newdata = train), train$Artist_Popularity)
performace_metrics[6,4] <- R2(predict(sim_train, newdata = train), train$Artist_Popularity)
performace_metrics[6,5] <- RMSE(predict(sim_train, newdata = test), test$Artist_Popularity)
performace_metrics[6,6] <- R2(predict(sim_train, newdata = test), test$Artist_Popularity)
par(mfrow=c(3,4))
plot(sim)
plot(fitted(sim), residuals(sim))
lines(lowess(fitted(sim), residuals(sim)), col="red", lwd=2)
n <- nrow(df)
1-sum(residuals(sim)^2)/((n-1)*var(df$Artist_Popularity))
## [1] 0.8674403
# Projection Pursuit Regression
ppr <- ppr(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity + Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq, data=df, nterm=5)
performace_metrics[7,1] <- RMSE(predict(ppr, newdata = df), df$Artist_Popularity)
performace_metrics[7,2] <- R2(predict(ppr, newdata = df), df$Artist_Popularity)
ppr_train <- ppr(Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 + pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity + Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq, data=train, nterm=5)
performace_metrics[7,3] <- RMSE(predict(ppr_train, newdata = train), train$Artist_Popularity)
performace_metrics[7,4] <- R2(predict(ppr_train, newdata = train), train$Artist_Popularity)
performace_metrics[7,5] <- RMSE(predict(ppr_train, newdata = test), test$Artist_Popularity)
performace_metrics[7,6] <- R2(predict(ppr_train, newdata = test), test$Artist_Popularity)
summary(ppr)
## Call:
## ppr(formula = Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 +
## pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity +
## Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq,
## data = df, nterm = 5)
##
## Goodness of fit:
## 5 terms
## 1741.435
##
## Projection direction vectors ('alpha'):
## term 1 term 2 term 3 term 4
## fa_1 -6.325427e-02 2.081791e-01 -8.112105e-02 4.462995e-01
## fa_2 1.999522e-01 1.988317e-01 3.820633e-01 -1.225150e-01
## fa_3 1.776410e-01 -4.813375e-01 4.625303e-01 -2.615833e-01
## fa_4 9.095299e-02 -7.325872e-01 5.161008e-01 -2.094942e-01
## pc_1 5.912856e-02 -1.634530e-01 -1.398765e-01 -2.846767e-02
## pc_2 7.261656e-02 1.218474e-01 -3.230672e-01 -5.558153e-01
## Artist_Follower 9.472078e-01 -2.009118e-01 1.651405e-01 5.085105e-01
## days_release_orig 5.284337e-07 -2.107916e-05 5.900920e-05 2.729395e-05
## Track_Popularity 6.925853e-03 1.765052e-02 -6.223949e-02 2.066660e-03
## Track_Duration_ms -3.916723e-07 4.233016e-07 2.333658e-06 -2.253623e-06
## fa_2sq -9.559306e-02 -2.453111e-01 5.941416e-02 -1.868207e-01
## pc_2sq -7.315117e-03 -7.878193e-02 4.566070e-01 -2.623794e-01
## Artist_Followersq -3.126859e-02 1.152058e-02 -7.044398e-03 -3.768289e-02
## term 5
## fa_1 -6.211143e-01
## fa_2 -1.261187e-01
## fa_3 -2.121784e-01
## fa_4 4.149667e-01
## pc_1 -2.597611e-01
## pc_2 4.152905e-01
## Artist_Follower -1.891676e-01
## days_release_orig -7.811224e-05
## Track_Popularity -2.572436e-03
## Track_Duration_ms -2.907082e-06
## fa_2sq -1.176104e-01
## pc_2sq 3.024768e-01
## Artist_Followersq 6.051800e-03
##
## Coefficients of ridge terms ('beta'):
## term 1 term 2 term 3 term 4 term 5
## 15.436585 3.228392 3.418258 2.647310 2.190618
par(mfrow=c(1,1))
plot(ppr)
plot(fitted(ppr), residuals(ppr))
lines(lowess(fitted(ppr), residuals(ppr)), col="red", lwd=2)
n <- nrow(df)
1-sum(residuals(ppr)^2)/((n-1)*var(df$Artist_Popularity))
## [1] 0.9590653
library("rpart")
set.seed(42)
tr1 <- rpart(Artist_Popularity~fa_1+fa_2+fa_3+fa_4+pc_1+pc_2+Artist_Follower+days_release_orig
+Track_Popularity+Track_Duration_ms+fa_2sq+pc_2sq+Artist_Followersq, data=df, cp = 0)
tr1
## n= 186
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 186 42541.81000 64.03226
## 2) Artist_Follower< 0.069933 69 6966.63800 49.07246
## 4) fa_2sq>=0.04873442 44 1927.88600 43.65909
## 8) Artist_Follower< 0.0063065 24 493.33330 39.33333
## 16) Track_Popularity< 42.5 7 246.85710 36.14286 *
## 17) Track_Popularity>=42.5 17 145.88240 40.64706 *
## 9) Artist_Follower>=0.0063065 20 446.55000 48.85000
## 18) Artist_Follower< 0.0181555 7 27.42857 45.71429 *
## 19) Artist_Follower>=0.0181555 13 313.23080 50.53846 *
## 5) fa_2sq< 0.04873442 25 1480.00000 58.60000
## 10) Track_Duration_ms>=322237 11 247.63640 53.81818 *
## 11) Track_Duration_ms< 322237 14 783.21430 62.35714 *
## 3) Artist_Follower>=0.069933 117 11026.53000 72.85470
## 6) Artist_Follower< 0.538149 59 2850.67800 66.23729
## 12) Track_Duration_ms>=327871 16 283.00000 59.25000 *
## 13) Track_Duration_ms< 327871 43 1495.86000 68.83721
## 26) Track_Popularity< 66 33 762.90910 66.81818
## 52) fa_1< -0.7142649 12 117.66670 64.16667 *
## 53) fa_1>=-0.7142649 21 512.66670 68.33333
## 106) Track_Duration_ms>=208616 7 77.71429 65.42857 *
## 107) Track_Duration_ms< 208616 14 346.35710 69.78571 *
## 27) Track_Popularity>=66 10 154.50000 75.50000 *
## 7) Artist_Follower>=0.538149 58 2964.06900 79.58621
## 14) Artist_Follower< 2.122275 41 1084.04900 76.26829
## 28) Track_Popularity< 59 25 470.96000 73.96000
## 56) Artist_Follower< 0.779791 11 144.54550 71.36364 *
## 57) Artist_Follower>=0.779791 14 194.00000 76.00000 *
## 29) Track_Popularity>=59 16 271.75000 79.87500 *
## 15) Artist_Follower>=2.122275 17 340.11760 87.58824 *
par(mfrow=c(1,1))
plot(tr1)
text(tr1)
# summary(tr1)
library("rpart.plot")
rpart.plot(tr1, cex = 0.7)
n <- nrow(x_noout)
1-sum(residuals(tr1)^2)/((n-1)*var(df$Artist_Popularity))
## [1] 0.9131701
RMSE(predict(tr1, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
## [1] 4.45642
R2(predict(tr1, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
## [,1]
## [1,] 0.9131701
performace_metrics[8,1] <- RMSE(predict(tr1, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
performace_metrics[8,2] <- R2(predict(tr1, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
par(mfrow=c(1,1))
printcp(tr1)
##
## Regression tree:
## rpart(formula = Artist_Popularity ~ fa_1 + fa_2 + fa_3 + fa_4 +
## pc_1 + pc_2 + Artist_Follower + days_release_orig + Track_Popularity +
## Track_Duration_ms + fa_2sq + pc_2sq + Artist_Followersq,
## data = df, cp = 0)
##
## Variables actually used in tree construction:
## [1] Artist_Follower fa_1 fa_2sq Track_Duration_ms
## [5] Track_Popularity
##
## Root node error: 42542/186 = 228.72
##
## n= 186
##
## CP nsplit rel error xerror xstd
## 1 0.5770474 0 1.000000 1.00653 0.083049
## 2 0.1225097 1 0.422953 0.48655 0.048436
## 3 0.0836530 2 0.300443 0.40487 0.043226
## 4 0.0361974 3 0.216790 0.31837 0.031836
## 5 0.0251945 4 0.180592 0.25499 0.027825
## 6 0.0232243 5 0.155398 0.25364 0.027712
## 7 0.0135972 6 0.132174 0.22116 0.026618
## 8 0.0105578 7 0.118577 0.20613 0.021909
## 9 0.0080236 8 0.108019 0.20183 0.021094
## 10 0.0031164 9 0.099995 0.18382 0.019763
## 11 0.0031126 10 0.096879 0.17620 0.018307
## 12 0.0024891 11 0.093766 0.17494 0.018228
## 13 0.0023646 12 0.091277 0.17700 0.018198
## 14 0.0020825 13 0.088912 0.17707 0.018191
## 15 0.0000000 14 0.086830 0.17756 0.018387
plotcp(tr1)
tr1$cptable
## CP nsplit rel error xerror xstd
## 1 0.577047401 0 1.00000000 1.0065265 0.08304941
## 2 0.122509677 1 0.42295260 0.4865458 0.04843559
## 3 0.083653037 2 0.30044292 0.4048674 0.04322625
## 4 0.036197394 3 0.21678988 0.3183658 0.03183584
## 5 0.025194452 4 0.18059249 0.2549941 0.02782496
## 6 0.023224285 5 0.15539804 0.2536358 0.02771156
## 7 0.013597245 6 0.13217375 0.2211631 0.02661788
## 8 0.010557834 7 0.11857651 0.2061339 0.02190935
## 9 0.008023608 8 0.10801867 0.2018287 0.02109449
## 10 0.003116364 9 0.09999507 0.1838249 0.01976306
## 11 0.003112575 10 0.09687870 0.1761958 0.01830748
## 12 0.002489096 11 0.09376613 0.1749360 0.01822793
## 13 0.002364588 12 0.09127703 0.1770043 0.01819798
## 14 0.002082545 13 0.08891244 0.1770745 0.01819139
## 15 0.000000000 14 0.08682990 0.1775587 0.01838699
best.cp <- tr1$cptable[which.min(tr1$cptable[,"xerror"]),"CP"]
best.cp
## [1] 0.002489096
dt_pruned <- prune(tr1, cp=best.cp)
1-sum(residuals(dt_pruned)^2)/((n-1)*var(df$Artist_Popularity))
## [1] 0.9062339
RMSE(predict(dt_pruned, newx = as.matrix(df[,-11])), as.matrix(df[,11]))
## [1] 4.630997
R2(predict(dt_pruned, newx = as.matrix(df[,-11])), as.matrix(df[,11])) # lower R^2 after pruning
## [,1]
## [1,] 0.9062339
library("caret")
# DT CV on full data
caret.control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
set.seed(42)
rpart.cv <- caret::train(Artist_Popularity ~ .,
data = df,
method = "rpart",
trControl = caret.control,
tuneLength = 15)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
rpart.cv
## CART
##
## 186 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 168, 167, 168, 168, 167, 167, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.000000000 6.074712 0.8414403 4.810231
## 0.002082545 6.068751 0.8423901 4.800191
## 0.002364588 6.052218 0.8437084 4.782182
## 0.002489096 6.053917 0.8434435 4.782970
## 0.003112575 6.044344 0.8438105 4.745985
## 0.003116364 6.044344 0.8438105 4.745985
## 0.008023608 6.226200 0.8325732 4.866355
## 0.010557834 6.403122 0.8199671 5.034487
## 0.013597245 6.553546 0.8110770 5.189022
## 0.023224285 7.292639 0.7677304 5.825220
## 0.025194452 7.438172 0.7583776 5.953699
## 0.036197394 8.040694 0.7224670 6.491552
## 0.083653037 8.679390 0.6818844 7.072580
## 0.122509677 9.979465 0.5780608 8.139785
## 0.577047401 13.444956 0.4565212 10.949275
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.003116364.
rpart.best.cv <- rpart.cv$finalModel
rpart.best.cv
## n= 186
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 186 42541.8100 64.03226
## 2) Artist_Follower< 0.069933 69 6966.6380 49.07246
## 4) fa_2sq>=0.04873442 44 1927.8860 43.65909
## 8) Artist_Follower< 0.0063065 24 493.3333 39.33333 *
## 9) Artist_Follower>=0.0063065 20 446.5500 48.85000 *
## 5) fa_2sq< 0.04873442 25 1480.0000 58.60000
## 10) Track_Duration_ms>=322237 11 247.6364 53.81818 *
## 11) Track_Duration_ms< 322237 14 783.2143 62.35714 *
## 3) Artist_Follower>=0.069933 117 11026.5300 72.85470
## 6) Artist_Follower< 0.538149 59 2850.6780 66.23729
## 12) Track_Duration_ms>=327871 16 283.0000 59.25000 *
## 13) Track_Duration_ms< 327871 43 1495.8600 68.83721
## 26) Track_Popularity< 66 33 762.9091 66.81818 *
## 27) Track_Popularity>=66 10 154.5000 75.50000 *
## 7) Artist_Follower>=0.538149 58 2964.0690 79.58621
## 14) Artist_Follower< 2.122275 41 1084.0490 76.26829
## 28) Track_Popularity< 59 25 470.9600 73.96000 *
## 29) Track_Popularity>=59 16 271.7500 79.87500 *
## 15) Artist_Follower>=2.122275 17 340.1176 87.58824 *
rpart.plot(rpart.best.cv)
RMSE(predict(rpart.best.cv, newdata = df), df$Artist_Popularity) # corresponds exactly to the pruned tree solution
## [1] 4.782344
R2(predict(rpart.best.cv, newdata = df), df$Artist_Popularity) # corresponds exactly to the pruned tree solution
## [1] 0.9000049
# DT CV on split data
set.seed(42)
idx.train <- createDataPartition(y = df$Artist_Popularity, p = 0.75, list = FALSE)
train.data <- df[idx.train, ]
test.data <- df[-idx.train,]
caret.control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
set.seed(42)
rpart.cv <- caret::train(Artist_Popularity ~ .,
data = train.data,
method = "rpart",
trControl = caret.control,
tuneLength = 15)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
rpart.cv
## CART
##
## 142 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 128, 127, 128, 128, 127, 128, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.00000000 6.262619 0.8356051 4.945561
## 0.03893783 7.948876 0.7357777 6.587601
## 0.07787565 8.733659 0.6826962 7.209211
## 0.11681348 9.340591 0.6418864 7.682783
## 0.15575130 10.167727 0.5758572 8.268144
## 0.19468913 11.032049 0.5094994 9.065660
## 0.23362695 11.336369 0.4881129 9.306610
## 0.27256478 11.336369 0.4881129 9.306610
## 0.31150261 11.336369 0.4881129 9.306610
## 0.35044043 11.336369 0.4881129 9.306610
## 0.38937826 11.336369 0.4881129 9.306610
## 0.42831608 11.336369 0.4881129 9.306610
## 0.46725391 11.336369 0.4881129 9.306610
## 0.50619173 11.336369 0.4881129 9.306610
## 0.54512956 13.737829 0.3936640 11.451762
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.
rpart.best.cv <- rpart.cv$finalModel
rpart.best.cv
## n= 142
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 142 33159.33000 64.23239
## 2) Artist_Follower< 0.2779545 84 11318.29000 54.85714
## 4) Artist_Follower< 0.008695 24 756.62500 40.87500
## 8) Track_Popularity< 48.5 15 399.73330 38.53333 *
## 9) Track_Popularity>=48.5 9 137.55560 44.77778 *
## 5) Artist_Follower>=0.008695 60 3992.85000 60.45000
## 10) pc_1< 1.01413 40 1397.97500 56.52500
## 20) Artist_Follower< 0.0946055 26 814.96150 54.03846
## 40) fa_2< -0.2211121 10 130.50000 49.50000 *
## 41) fa_2>=-0.2211121 16 349.75000 56.87500 *
## 21) Artist_Follower>=0.0946055 14 123.71430 61.14286 *
## 11) pc_1>=1.01413 20 746.20000 68.30000
## 22) Track_Popularity< 54.5 7 75.42857 63.71429 *
## 23) Track_Popularity>=54.5 13 444.30770 70.76923 *
## 3) Artist_Follower>=0.2779545 58 3764.91400 77.81034
## 6) Artist_Follower< 2.122275 46 1593.93500 74.84783
## 12) Track_Popularity< 68.5 37 1001.29700 73.27027
## 24) Artist_Follower< 0.5444075 11 250.72730 68.54545 *
## 25) Artist_Follower>=0.5444075 26 401.11540 75.26923
## 50) pc_2sq>=0.5148601 13 124.92310 73.07692 *
## 51) pc_2sq< 0.5148601 13 151.23080 77.46154 *
## 13) Track_Popularity>=68.5 9 122.00000 81.33333 *
## 7) Artist_Follower>=2.122275 12 219.66670 89.16667 *
rpart.plot(rpart.best.cv)
RMSE(predict(rpart.best.cv, newdata = train.data), train.data$Artist_Popularity)
## [1] 4.220621
R2(predict(rpart.best.cv, newdata = train.data), train.data$Artist_Popularity)
## [1] 0.9237157
performace_metrics[8,3] <- RMSE(predict(rpart.best.cv, newdata = train.data), train.data$Artist_Popularity)
performace_metrics[8,4] <- R2(predict(rpart.best.cv, newdata = train.data), train.data$Artist_Popularity)
RMSE(predict(rpart.best.cv, newdata = test.data), test.data$Artist_Popularity)
## [1] 7.001912
R2(predict(rpart.best.cv, newdata = test.data), test.data$Artist_Popularity)
## [1] 0.808911
performace_metrics[8,5] <- RMSE(predict(rpart.best.cv, newdata = test.data), test.data$Artist_Popularity)
performace_metrics[8,6] <- R2(predict(rpart.best.cv, newdata = test.data), test.data$Artist_Popularity)
#########################################################################
library("randomForest")
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
set.seed(42)
rf <-randomForest(Artist_Popularity~fa_1+fa_2+fa_3+fa_4+pc_1+pc_2+Artist_Follower+days_release_orig
+Track_Popularity+Track_Duration_ms+fa_2sq+pc_2sq+Artist_Followersq, data=df, cp = 0, method = 'anova')
summary(rf)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 186 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 186 -none- numeric
## importance 13 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 186 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
imp <- importance(rf)
ind <- order(imp, decreasing=T)
imp[ind,]
## Artist_Follower Artist_Followersq fa_2 Track_Duration_ms
## 12415.9483 11284.4988 3553.0032 3522.9291
## pc_2 Track_Popularity fa_2sq pc_2sq
## 3490.9451 1639.1507 1388.5078 979.8766
## pc_1 days_release_orig fa_4 fa_3
## 896.7161 769.5573 713.0933 706.7977
## fa_1
## 455.5893
rf_pred <- predict(rf, df, type="class")
n <- nrow(df)
1-sum(residuals(rf)^2)/((n-1)*var(df$Artist_Popularity))
## [1] 1
RMSE(predict(rf, newdata = df), df$Artist_Popularity)
## [1] 2.114803
R2(predict(rf, newdata = df), df$Artist_Popularity)
## [1] 0.9825223
performace_metrics[9,1] <- RMSE(predict(rf, newdata = df), df$Artist_Popularity)
performace_metrics[9,2] <- R2(predict(rf, newdata = df), df$Artist_Popularity)
# RMSE(predict(rf, newdata = train.data), train.data$Artist_Popularity)
# R2(predict(rf, newdata = train.data), train.data$Artist_Popularity)
# RMSE(predict(rf, newdata = test.data), test.data$Artist_Popularity)
# R2(predict(rf, newdata = test.data), test.data$Artist_Popularity)
#########################################################################
library("data.table")
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library("mlr")
## Loading required package: ParamHelpers
## 'mlr' is in maintenance mode since July 2019. Future development
## efforts will go into its successor 'mlr3' (<https://mlr3.mlr-org.com>).
##
## Attaching package: 'mlr'
## The following objects are masked _by_ '.GlobalEnv':
##
## f1, rmse
## The following object is masked from 'package:e1071':
##
## impute
## The following object is masked from 'package:caret':
##
## train
library("parallel")
rf_train <- as.data.table(train)
rf_test <- as.data.table(test)
task <- makeRegrTask(data = train, target = "Artist_Popularity")
rf_learner <- makeLearner("regr.randomForest",
predict.type = "response")
rf.parms <- makeParamSet(
# The recommendation for mtry by Breiman is squareroot number of columns
makeDiscreteParam("mtry", values = c(2,3,4,5,6)), # Number of features selected at each node, smaller -> faster
makeDiscreteParam("sampsize", values = c(30, 50, 70)), # bootstrap sample size, smaller -> faster
makeDiscreteParam("ntree", values = c(10,30,50,100, 500, 1000)) # Number of tree, smaller -> faster
)
tuneControl <- makeTuneControlGrid(resolution = 3, tune.threshold = FALSE)
rdesc <- makeResampleDesc(method = "CV", iters = 5, stratify = FALSE)
library("parallelMap")
parallelStartSocket(4, level = "mlr.tuneParams")
## Starting parallelization in mode=socket with cpus=4.
#tuning <- tuneParams(rf_learner, task = task, resampling = rdesc,
# par.set = rf.parms, control = tuneControl, measures = mlr::rmse)
# https://stackoverflow.com/questions/51333410/mlr-why-does-reproducibility-of-hyperparameter-tuning-fail-using-parallelizatio
suppressMessages({
set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)
tuning <- tuneParams(rf_learner, task = task, resampling = rdesc,
par.set = rf.parms, control = tuneControl, measures = mlr::rmse)
})
parallelStop()
## Stopped parallelization. All cleaned up.
tuning$x
## $mtry
## [1] 5
##
## $sampsize
## [1] 70
##
## $ntree
## [1] 1000
tuning_results <- generateHyperParsEffectData(tuning, partial.dep = TRUE)
tuning_results$data
tapply(tuning_results$data$rmse.test.rmse, INDEX = c(tuning_results$data$mtry), mean)
## 2 3 4 5 6
## 6.276888 5.882133 5.681335 5.635646 5.688866
rf_tuned <- setHyperPars(rf_learner, par.vals = tuning$x)
rf_model <- mlr::train(rf_tuned, task = task)
rf_pred_train <- predict(rf_model, newdata = train, type = "response")
rf_pred_test <- predict(rf_model, newdata = test, type = "response")
# rf_pred_train$data$response
# rf_pred_test$data$response
RMSE(predict(rf_model, newdata = train, type = "response")$data$response, train$Artist_Popularity)
## [1] 3.370124
R2(predict(rf_model, newdata = train, type = "response")$data$response, train$Artist_Popularity)
## [1] 0.9581589
performace_metrics[9,3] <- RMSE(predict(rf_model, newdata = train, type = "response")$data$response, train$Artist_Popularity)
performace_metrics[9,4] <- R2(predict(rf_model, newdata = train, type = "response")$data$response, train$Artist_Popularity)
RMSE(predict(rf_model, newdata = test, type = "class")$data$response, test$Artist_Popularity)
## [1] 4.740843
R2(predict(rf_model, newdata = test, type = "class")$data$response, test$Artist_Popularity)
## [1] 0.9040584
performace_metrics[9,5] <- RMSE(predict(rf_model, newdata = test, type = "response")$data$response, test$Artist_Popularity)
performace_metrics[9,6] <- R2(predict(rf_model, newdata = test, type = "response")$data$response, test$Artist_Popularity)
library("MASS")
library("nnet")
##
## Attaching package: 'nnet'
## The following object is masked from 'package:mgcv':
##
## multinom
df_z <- scale(as.matrix(df))
train_z <- as.data.frame(df_z[idx.train, ])
test_z <- as.data.frame(df_z[-idx.train,])
mean_mat <- matrix(, nrow = dim(df)[1], ncol = dim(df)[2])
mean_mat[1, ] <- as.numeric(sapply(df, mean, na.rm = TRUE))
library("zoo")
mean_mat <- na.locf(mean_mat)
diag_sd <- diag(sapply(df, sd, na.rm = TRUE))
# Backtransformation scale to orginal
# as.matrix(df_z) %*% diag_sd + mean_mat
task <- makeRegrTask(data = train_z, target = "Artist_Popularity")
nnet <- makeLearner("regr.nnet", predict.type = "response", par.vals = list("trace" = FALSE, "maxit" = 1000,
"MaxNWts" = 80000))
nn.parms <- makeParamSet(
makeDiscreteParam("decay", values = c(0.0001,0.001, 0.01, 0.1)),
makeDiscreteParam("size", values = c(2,4,6,8,10,12,14)))
tuneControl <- makeTuneControlGrid(resolution = 3, tune.threshold = FALSE)
rdesc <- makeResampleDesc(method = "CV", iters = 5, stratify = FALSE)
parallelStartSocket(4, level = "mlr.tuneParams")
## Starting parallelization in mode=socket with cpus=4.
# tuning <- tuneParams(nnet, task = task, resampling = rdesc, par.set = nn.parms, control = tuneControl,
# measures = mlr::rmse)
suppressMessages({
set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)
tuning <- tuneParams(nnet, task = task, resampling = rdesc, par.set = nn.parms, control = tuneControl,
measures = mlr::rmse)
})
parallelStop()
## Stopped parallelization. All cleaned up.
tuning$x # result with full data set was decay = 1e-04, size = 2, rmse.test.rmse = 0.4336486
## $decay
## [1] 0.01
##
## $size
## [1] 2
tuning_results <- generateHyperParsEffectData(tuning, partial.dep = FALSE)
plotHyperParsEffect(tuning_results, x = "size", y = "rmse.test.rmse")
plotHyperParsEffect(tuning_results, x = "decay", y = "rmse.test.rmse")
nnet.tuned <- setHyperPars(nnet, par.vals = tuning$x)
nn_model <- mlr::train(nnet.tuned, task = task)
nn_pred_train <- predict(nn_model, newdata = train_z, type = "response")
nn_pred_test <- predict(nn_model, newdata = test_z, type = "response")
RMSE(predict(nn_model, newdata = train_z, type = "response")$data$response, train_z$Artist_Popularity)
## [1] 0.3410186
R2(predict(nn_model, newdata = train_z, type = "response")$data$response, train_z$Artist_Popularity)
## [1] 0.8854886
RMSE(predict(nn_model, newdata = test_z, type = "response")$data$response, test_z$Artist_Popularity)
## [1] 0.4563401
R2(predict(nn_model, newdata = test_z, type = "response")$data$response, test_z$Artist_Popularity)
## [1] 0.8231612
# Scale predictions back to original:
# as.matrix(train_z[, 11]) %*% diag_sd[11,11] + mean_mat[1,11] == train[,11]
RMSE(as.matrix(nn_pred_train$data$response) %*% diag_sd[11,11] + mean_mat[1,11], train$Artist_Popularity)
## [1] 5.171305
R2(as.matrix(nn_pred_train$data$response) %*% diag_sd[11,11] + mean_mat[1,11], train$Artist_Popularity)
## [,1]
## [1,] 0.8854886
performace_metrics[10,3] <- RMSE(as.matrix(nn_pred_train$data$response) %*% diag_sd[11,11] + mean_mat[1,11], train$Artist_Popularity)
performace_metrics[10,4] <- R2(as.matrix(nn_pred_train$data$response) %*% diag_sd[11,11] + mean_mat[1,11], train$Artist_Popularity)
RMSE(as.matrix(nn_pred_test$data$response) %*% diag_sd[11,11] + mean_mat[1,11], test$Artist_Popularity)
## [1] 6.920074
R2(as.matrix(nn_pred_test$data$response) %*% diag_sd[11,11] + mean_mat[1,11], test$Artist_Popularity)
## [,1]
## [1,] 0.8231612
performace_metrics[10,5] <- RMSE(as.matrix(nn_pred_test$data$response) %*% diag_sd[11,11] + mean_mat[1,11], test$Artist_Popularity)
performace_metrics[10,6] <- R2(as.matrix(nn_pred_test$data$response) %*% diag_sd[11,11] + mean_mat[1,11], test$Artist_Popularity)
round(performace_metrics, digits = 2)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 6.91 0.79 6.84 0.80 9.28 0.63
## [2,] 7.24 0.77 7.21 0.78 8.19 0.70
## [3,] 7.03 0.78 6.94 0.79 7.70 0.73
## [4,] 7.04 0.78 6.94 0.79 8.06 0.71
## [5,] 3.37 0.95 3.32 0.95 53.30 0.22
## [6,] 5.51 0.87 5.59 0.87 5.93 0.84
## [7,] 3.06 0.96 3.11 0.96 8.09 0.77
## [8,] 4.46 0.91 4.22 0.92 7.00 0.81
## [9,] 2.11 0.98 3.37 0.96 4.74 0.90
## [10,] NA NA 5.17 0.89 6.92 0.82
# Classification
df$Charts <- as.factor(x_noout$Charts)
set.seed(42)
rpart <- rpart(data = df,Charts~.,method = 'class')
rpart.plot(rpart)
rf_pred <- predict(rpart, newdata = df, type = "class")
confMat <- table(df$Charts,rf_pred)
confMat # 0: not in charts, 1: in charts
## rf_pred
## 0 1
## 0 93 0
## 1 7 86
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.9623656
# 0.9623656
par(mfrow=c(1,1))
printcp(rpart)
##
## Classification tree:
## rpart(formula = Charts ~ ., data = df, method = "class")
##
## Variables actually used in tree construction:
## [1] fa_2 pc_1 Track_Duration_ms
##
## Root node error: 93/186 = 0.5
##
## n= 186
##
## CP nsplit rel error xerror xstd
## 1 0.795699 0 1.000000 1.18280 0.072088
## 2 0.118280 1 0.204301 0.21505 0.045429
## 3 0.010753 2 0.086022 0.10753 0.033076
## 4 0.010000 3 0.075269 0.13978 0.037390
plotcp(rpart)
rpart$cptable
## CP nsplit rel error xerror xstd
## 1 0.79569892 0 1.00000000 1.1827957 0.07208812
## 2 0.11827957 1 0.20430108 0.2150538 0.04542863
## 3 0.01075269 2 0.08602151 0.1075269 0.03307630
## 4 0.01000000 3 0.07526882 0.1397849 0.03738999
best.cp <- rpart$cptable[which.min(rpart$cptable[,"xerror"]),"CP"]
best.cp
## [1] 0.01075269
set.seed(42)
rpart_pruned <- prune(rpart, cp=best.cp)
rpart.plot(rpart_pruned)
rf_pred <- predict(rpart_pruned, newdata = df, type = "class")
confMat <- table(df$Charts,rf_pred)
confMat # 0: not in charts, 1: in charts
## rf_pred
## 0 1
## 0 89 4
## 1 4 89
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.9569892
# 0.9569892
library("caret")
train.data <- df[idx.train, ]
test.data <- df[-idx.train,]
caret.control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
set.seed(42)
rpart.cv <- caret::train(Charts ~ .,
data = train.data,
method = "rpart",
trControl = caret.control,
tuneLength = 15)
rpart.cv
## CART
##
## 142 samples
## 14 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 127, 128, 127, 127, 128, 129, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.0000000 0.8944567 0.7881329
## 0.0575693 0.8852503 0.7690778
## 0.1151386 0.8663126 0.7304763
## 0.1727079 0.8685348 0.7345800
## 0.2302772 0.8685348 0.7345800
## 0.2878465 0.8685348 0.7345800
## 0.3454158 0.8685348 0.7345800
## 0.4029851 0.8685348 0.7345800
## 0.4605544 0.8685348 0.7345800
## 0.5181237 0.8685348 0.7345800
## 0.5756930 0.8685348 0.7345800
## 0.6332623 0.8685348 0.7345800
## 0.6908316 0.8685348 0.7345800
## 0.7484009 0.8685348 0.7345800
## 0.8059701 0.6919536 0.3540940
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
set.seed(42)
rpart.best.cv <- rpart.cv$finalModel
rpart.best.cv
## n= 142
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 142 67 1 (0.47183099 0.52816901)
## 2) Track_Duration_ms>=257068 56 1 0 (0.98214286 0.01785714) *
## 3) Track_Duration_ms< 257068 86 12 1 (0.13953488 0.86046512)
## 6) fa_2< -0.220861 9 1 0 (0.88888889 0.11111111) *
## 7) fa_2>=-0.220861 77 4 1 (0.05194805 0.94805195)
## 14) pc_1< -0.7482784 7 3 0 (0.57142857 0.42857143) *
## 15) pc_1>=-0.7482784 70 0 1 (0.00000000 1.00000000) *
rpart.plot(rpart.best.cv)
rpart_pred <- predict(rpart.best.cv, newdata = train.data, type = "class")
confMat <- table(train.data$Charts,rpart_pred)
confMat # 0: not in charts, 1: in charts
## rpart_pred
## 0 1
## 0 67 0
## 1 5 70
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.9647887
# 0.9507042
rpart_pred <- predict(rpart.best.cv, newdata = test.data, type = "class")
confMat <- table(test.data$Charts,rpart_pred)
confMat # 0: not in charts, 1: in charts
## rpart_pred
## 0 1
## 0 26 0
## 1 1 17
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.9772727
# 0.9318182
#################################################
# Random forest
rf_train <- train.data
rf_test <- test.data
task <- makeClassifTask(data = rf_train, target = "Charts", positive = "1")
rf_learner <- makeLearner("classif.randomForest",
predict.type = "prob", # prediction type needs to be specified for the learner
par.vals = list("replace" = TRUE, "importance" = FALSE))
rf.parms <- makeParamSet(
# The recommendation for mtry by Breiman is squareroot number of columns
makeDiscreteParam("mtry", values = c(2,3,4,5,6)), # Number of features selected at each node, smaller -> faster
makeDiscreteParam("sampsize", values = c(30, 50, 70)), # bootstrap sample size, smaller -> faster
makeDiscreteParam("ntree", values = c(10,30,50,100, 500, 1000)) # Number of tree, smaller -> faster
)
tuneControl <- makeTuneControlGrid(resolution = 3, tune.threshold = FALSE)
rdesc <- makeResampleDesc(method = "CV", iters = 5, stratify = TRUE)
parallelStartSocket(4, level = "mlr.tuneParams")
## Starting parallelization in mode=socket with cpus=4.
suppressMessages({
set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)
tuning <- tuneParams(rf_learner, task = task, resampling = rdesc,
par.set = rf.parms, control = tuneControl, measures = mlr::auc)
})
parallelStop()
## Stopped parallelization. All cleaned up.
tuning$x
## $mtry
## [1] 2
##
## $sampsize
## [1] 70
##
## $ntree
## [1] 30
tuning_results <- generateHyperParsEffectData(tuning, partial.dep = TRUE)
tuning_results$data
tapply(tuning_results$data$auc.test.mean, INDEX = c(tuning_results$data$mtry), mean)
## 2 3 4 5 6
## 0.9959381 0.9951058 0.9951486 0.9935470 0.9945442
rf_tuned <- setHyperPars(rf_learner, par.vals = tuning$x)
rf_model <- mlr::train(rf_tuned, task = task)
rf_pred <- predict(rf_model, newdata = train.data, type = "class")
# rf_pred$data$response
confMat <- table(train.data$Charts,rf_pred$data$response)
confMat # 0: not in charts, 1: in charts
##
## 0 1
## 0 67 0
## 1 0 75
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 1
# 1
rf_pred <- predict(rf_model, newdata = test.data, type = "class")
# rf_pred$data$response
confMat <- table(test.data$Charts,rf_pred$data$response)
confMat # 0: not in charts, 1: in charts
##
## 0 1
## 0 26 0
## 1 0 18
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 1
# 1
# Neural net classification
df$Charts <- x_noout$Charts
train_z$Charts <- df[idx.train, "Charts"]
test_z$Charts <- df[-idx.train, "Charts"]
task <- makeClassifTask(data = train_z, target = "Charts", positive = "1")
nnet <- makeLearner("classif.nnet", predict.type = "prob", par.vals = list("trace" = FALSE, "maxit" = 1000,
"MaxNWts" = 80000))
nn.parms <- makeParamSet(
makeDiscreteParam("decay", values = c(0.0001,0.001, 0.01, 0.1)),
makeDiscreteParam("size", values = c(2,4,6,8,10,12,14)))
tuneControl <- makeTuneControlGrid(resolution = 3, tune.threshold = FALSE)
rdesc <- makeResampleDesc(method = "CV", iters = 5, stratify = TRUE)
parallelStartSocket(4, level = "mlr.tuneParams")
## Starting parallelization in mode=socket with cpus=4.
suppressMessages({
set.seed(123456, "L'Ecuyer-CMRG")
clusterSetRNGStream(iseed = 123456)
tuning <- tuneParams(nnet, task = task, resampling = rdesc, par.set = nn.parms, control = tuneControl,
measures = mlr::auc)
})
parallelStop()
## Stopped parallelization. All cleaned up.
tuning$x # result with full data set was decay = 1e-04, size = 2, rmse.test.rmse = 0.4336486
## $decay
## [1] 0.1
##
## $size
## [1] 12
tuning_results <- generateHyperParsEffectData(tuning, partial.dep = FALSE)
plotHyperParsEffect(tuning_results, x = "size", y = "auc.test.mean")
plotHyperParsEffect(tuning_results, x = "decay", y = "auc.test.mean")
nnet.tuned <- setHyperPars(nnet, par.vals = tuning$x)
nn_model <- mlr::train(nnet.tuned, task = task)
model <- mlr::train(nnet.tuned, task = task)
nn_pred_train <- predict(model, newdata = train_z, type = "class")
nn_pred_test <- predict(model, newdata = test_z, type = "class")
confMat <- table(train_z$Charts,nn_pred_train$data$response)
confMat # 0: not in charts, 1: in charts
##
## 0 1
## 0 67 0
## 1 0 75
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 1
# 1
confMat <- table(test_z$Charts,nn_pred_test$data$response)
confMat # 0: not in charts, 1: in charts
##
## 0 1
## 0 25 1
## 1 0 18
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.9772727
# 1
```